From 18a2543ee0575202a1c51627a96f2637e2ba6981 Mon Sep 17 00:00:00 2001 From: Jochen Sieg Date: Mon, 9 Sep 2024 13:04:00 +0200 Subject: [PATCH 1/6] mol2morgan_fingerprint: remove deprecated fp function - Remove AllChem.GetMorganFingerprintAsBitVect from the code because it is deprecated. - Add a test to ensure the bit2atom mapping works as intended. --- molpipeline/mol2any/mol2morgan_fingerprint.py | 13 ++++++------ .../test_mol2morgan_fingerprint.py | 21 +++++++++++++++++++ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/molpipeline/mol2any/mol2morgan_fingerprint.py b/molpipeline/mol2any/mol2morgan_fingerprint.py index 1c93295d..79fa46c1 100644 --- a/molpipeline/mol2any/mol2morgan_fingerprint.py +++ b/molpipeline/mol2any/mol2morgan_fingerprint.py @@ -151,12 +151,11 @@ def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]: dict[int, list[tuple[int, int]]] Dictionary with bit position as key and list of tuples with atom index and radius as value. """ - bit_info: dict[int, list[tuple[int, int]]] = {} - _ = AllChem.GetMorganFingerprintAsBitVect( - mol_obj, - self.radius, - useFeatures=self._use_features, - bitInfo=bit_info, - nBits=self._n_bits, + fp_generator = self._get_fp_generator() + additional_output = AllChem.AdditionalOutput() + additional_output.AllocateBitInfoMap() + _ = fp_generator.GetSparseFingerprint( + mol_obj, additionalOutput=additional_output ) + bit_info = additional_output.GetBitInfoMap() return bit_info diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index 3a5e94a9..14ff2282 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -128,6 +128,27 @@ def test_setter_getter_error_handling(self) -> None: } self.assertRaises(ValueError, mol_fp.set_params, **params) + def test_bit2atom_mapping(self) -> None: + """Test that the 
mapping from bits to atom weights works as intended.""" + # lower n_bit values, e.g. 2048, will lead to a bit clash during folding, + # for the test smiles "NCCOCCCC(=O)O". + # We want no folding clashes in this test to check the correct length + # of the bit-to-atom mapping. + n_bits = 2100 + sparse_morgan = MolToMorganFP(radius=2, n_bits=n_bits, return_as="sparse") + dense_morgan = MolToMorganFP(radius=2, n_bits=n_bits, return_as="dense") + explicit_bit_vect_morgan = MolToMorganFP( + radius=2, n_bits=n_bits, return_as="explicit_bit_vect" + ) + + smi2mol = SmilesToMol() + for test_smi in test_smiles: + for fp_gen in [sparse_morgan, dense_morgan, explicit_bit_vect_morgan]: + mol = smi2mol.transform([test_smi])[0] + fp = fp_gen.transform([mol]) + mapping = fp_gen.bit2atom_mapping(mol) + self.assertEqual(np.sum(fp), len(mapping)) # type: ignore + if __name__ == "__main__": unittest.main() From 91152e188016e5da4129ed5d2806c654cea60031 Mon Sep 17 00:00:00 2001 From: frederik-sandfort1 <129401811+frederik-sandfort1@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:23:48 +0200 Subject: [PATCH 2/6] Error handling sanitize bug (#85) * fix molsanitize exception error catching * linting * isort on other stuff --- molpipeline/estimators/chemprop/models.py | 6 +-- molpipeline/pipeline/_molpipeline.py | 2 +- .../test_chemprop/test_chemprop_pipeline.py | 2 +- test_extras/test_chemprop/test_models.py | 4 +- tests/test_elements/test_error_handling.py | 46 ++++++++++++++++++- 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/molpipeline/estimators/chemprop/models.py b/molpipeline/estimators/chemprop/models.py index b94bcb02..e720e029 100644 --- a/molpipeline/estimators/chemprop/models.py +++ b/molpipeline/estimators/chemprop/models.py @@ -15,9 +15,7 @@ try: from chemprop.data import MoleculeDataset, build_dataloader - from chemprop.nn.predictors import ( - BinaryClassificationFFNBase, - ) + from chemprop.nn.predictors import BinaryClassificationFFNBase from lightning 
import pytorch as pl except ImportError as error: logger.error( @@ -31,9 +29,9 @@ MPNN, BinaryClassificationFFN, BondMessagePassing, + MulticlassClassificationFFN, RegressionFFN, SumAggregation, - MulticlassClassificationFFN, ) from molpipeline.estimators.chemprop.neural_fingerprint import ChempropNeuralFP diff --git a/molpipeline/pipeline/_molpipeline.py b/molpipeline/pipeline/_molpipeline.py index 3ddb7c9b..8ff43eb5 100644 --- a/molpipeline/pipeline/_molpipeline.py +++ b/molpipeline/pipeline/_molpipeline.py @@ -349,7 +349,7 @@ def transform_single(self, input_value: Any) -> Any: elif isinstance(p_element, FilterReinserter): iter_value = p_element.transform_single(iter_value) except MolSanitizeException as err: - return InvalidInstance( + iter_value = InvalidInstance( p_element.uuid, f"RDKit MolSanitizeException: {err.args}", p_element.name, diff --git a/test_extras/test_chemprop/test_chemprop_pipeline.py b/test_extras/test_chemprop/test_chemprop_pipeline.py index 646ac99c..32c4e677 100644 --- a/test_extras/test_chemprop/test_chemprop_pipeline.py +++ b/test_extras/test_chemprop/test_chemprop_pipeline.py @@ -22,8 +22,8 @@ from molpipeline.estimators.chemprop.models import ( ChempropClassifier, ChempropModel, - ChempropRegressor, ChempropMulticlassClassifier, + ChempropRegressor, ) from molpipeline.mol2any.mol2chemprop import MolToChemprop from molpipeline.pipeline import Pipeline diff --git a/test_extras/test_chemprop/test_models.py b/test_extras/test_chemprop/test_models.py index 9afaf111..57a434b4 100644 --- a/test_extras/test_chemprop/test_models.py +++ b/test_extras/test_chemprop/test_models.py @@ -28,10 +28,10 @@ # pylint: disable=relative-beyond-top-level from test_extras.test_chemprop.chemprop_test_utils.compare_models import compare_params from test_extras.test_chemprop.chemprop_test_utils.constant_vars import ( - NO_IDENTITY_CHECK, - DEFAULT_SET_PARAMS, DEFAULT_BINARY_CLASSIFICATION_PARAMS, DEFAULT_MULTICLASS_CLASSIFICATION_PARAMS, + DEFAULT_SET_PARAMS, + 
NO_IDENTITY_CHECK, ) from test_extras.test_chemprop.chemprop_test_utils.default_models import ( get_chemprop_model_binary_classification_mpnn, diff --git a/tests/test_elements/test_error_handling.py b/tests/test_elements/test_error_handling.py index 535b256a..dd134f48 100644 --- a/tests/test_elements/test_error_handling.py +++ b/tests/test_elements/test_error_handling.py @@ -4,12 +4,16 @@ from typing import Any import numpy as np -from rdkit import RDLogger +from rdkit import Chem, RDLogger +from rdkit.Chem.rdchem import MolSanitizeException from sklearn.base import clone from molpipeline import ErrorFilter, FilterReinserter, Pipeline, PostPredictionWrapper +from molpipeline.abstract_pipeline_elements.core import MolToMolPipelineElement from molpipeline.any2mol import SmilesToMol +from molpipeline.any2mol.auto2mol import AutoToMol from molpipeline.mol2any import MolToMorganFP, MolToRDKitPhysChem, MolToSmiles +from molpipeline.utils.molpipeline_types import OptionalMol, RDKitMol from tests.utils.mock_element import MockTransformingPipelineElement rdlog = RDLogger.logger() @@ -247,3 +251,43 @@ def test_replace_mixed_datatypes_expected_failures(self) -> None: self.assertRaises(ValueError, pipeline.fit, test_values) self.assertRaises(ValueError, pipeline.transform, test_values) self.assertRaises(ValueError, pipeline2.fit_transform, test_values) + + def test_molsanitize_error(self) -> None: + """Test if MolSanitizeException is caught and catched by ErrorFilter.""" + + class DummyMolSanitizeExc(MolToMolPipelineElement): + """MolToMolPipelineElement with dummy molsanitize exception.""" + + def pretransform_single(self, value: RDKitMol) -> OptionalMol: + """Dummy Mol. + + Parameters + ---------- + value: RDKitMol + Molecule. + + Returns + ------- + OptionalMol + Molecule. 
+ """ + if Chem.MolToSmiles(value) == "c1ccccc1": + raise MolSanitizeException("This is a dummy exception.") + return value + + pipeline = Pipeline( + [ + ("autotosmiles", AutoToMol()), + ("atomneutralizer", DummyMolSanitizeExc()), + ("moltosmiles", MolToSmiles()), + ("errorfilter", error_filter := ErrorFilter()), + ( + "filterreinserter", + FilterReinserter.from_error_filter(error_filter, None), + ), + ], + n_jobs=-1, + ) + + result = pipeline.transform(["c1ccccc1", "CCCCCCC", "c1cc"]) + self.assertEqual(result, [None, "CCCCCCC", None]) From 85cacba32450c39a8cf57e26cdfbc78aaa28ba5f Mon Sep 17 00:00:00 2001 From: frederik-sandfort1 <129401811+frederik-sandfort1@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:00:29 +0200 Subject: [PATCH 3/6] Inchitomol (#86) * fix molsanitize exception error catching * linting * isort on other stuff * add inchitomol element --- .../any2mol/string2mol.py | 64 ++++++++++++++++++- molpipeline/any2mol/__init__.py | 2 + molpipeline/any2mol/auto2mol.py | 4 +- molpipeline/any2mol/inchi2mol.py | 27 ++++++++ molpipeline/any2mol/smiles2mol.py | 36 ++--------- .../test_any2mol/test_auto2mol.py | 28 ++++++++ 6 files changed, 128 insertions(+), 33 deletions(-) create mode 100644 molpipeline/any2mol/inchi2mol.py diff --git a/molpipeline/abstract_pipeline_elements/any2mol/string2mol.py b/molpipeline/abstract_pipeline_elements/any2mol/string2mol.py index cc1f5c53..9bd6ac75 100644 --- a/molpipeline/abstract_pipeline_elements/any2mol/string2mol.py +++ b/molpipeline/abstract_pipeline_elements/any2mol/string2mol.py @@ -4,8 +4,11 @@ import abc -from molpipeline.abstract_pipeline_elements.core import AnyToMolPipelineElement -from molpipeline.utils.molpipeline_types import OptionalMol +from molpipeline.abstract_pipeline_elements.core import ( + AnyToMolPipelineElement, + InvalidInstance, +) +from molpipeline.utils.molpipeline_types import OptionalMol, RDKitMol class StringToMolPipelineElement(AnyToMolPipelineElement, abc.ABC): @@ -43,3 +46,60 @@ def 
pretransform_single(self, value: str) -> OptionalMol: OptionalMol RDKit molecule if representation was valid, else InvalidInstance. """ + + +class SimpleStringToMolElement(StringToMolPipelineElement, abc.ABC): + """Transforms string representation to RDKit Mol objects.""" + + def pretransform_single(self, value: str) -> OptionalMol: + """Transform string to molecule. + + Parameters + ---------- + value: str + string representation. + + Returns + ------- + OptionalMol + Rdkit molecule if valid string representation, else None. + """ + if value is None: + return InvalidInstance( + self.uuid, + f"Invalid representation: {value}", + self.name, + ) + + if not isinstance(value, str): + return InvalidInstance( + self.uuid, + f"Not a string: {value}", + self.name, + ) + + mol: RDKitMol = self.string_to_mol(value) + + if not mol: + return InvalidInstance( + self.uuid, + f"Invalid representation: {value}", + self.name, + ) + mol.SetProp("identifier", value) + return mol + + @abc.abstractmethod + def string_to_mol(self, value: str) -> RDKitMol: + """Transform string representation to molecule. + + Parameters + ---------- + value: str + string representation + + Returns + ------- + RDKitMol + Rdkit molecule if valid representation, else None. 
+ """ diff --git a/molpipeline/any2mol/__init__.py b/molpipeline/any2mol/__init__.py index 5b8b2da3..c4dabadd 100644 --- a/molpipeline/any2mol/__init__.py +++ b/molpipeline/any2mol/__init__.py @@ -2,6 +2,7 @@ from molpipeline.any2mol.auto2mol import AutoToMol from molpipeline.any2mol.bin2mol import BinaryToMol +from molpipeline.any2mol.inchi2mol import InchiToMol from molpipeline.any2mol.sdf2mol import SDFToMol from molpipeline.any2mol.smiles2mol import SmilesToMol @@ -9,5 +10,6 @@ "AutoToMol", "BinaryToMol", "SmilesToMol", + "InchiToMol", "SDFToMol", ] diff --git a/molpipeline/any2mol/auto2mol.py b/molpipeline/any2mol/auto2mol.py index b33ee2d8..925b7c95 100644 --- a/molpipeline/any2mol/auto2mol.py +++ b/molpipeline/any2mol/auto2mol.py @@ -9,6 +9,7 @@ InvalidInstance, ) from molpipeline.any2mol.bin2mol import BinaryToMol +from molpipeline.any2mol.inchi2mol import InchiToMol from molpipeline.any2mol.sdf2mol import SDFToMol from molpipeline.any2mol.smiles2mol import SmilesToMol from molpipeline.utils.molpipeline_types import OptionalMol, RDKitMol @@ -29,6 +30,7 @@ def __init__( uuid: Optional[str] = None, elements: tuple[AnyToMolPipelineElement, ...] = ( SmilesToMol(), + InchiToMol(), BinaryToMol(), SDFToMol(), ), @@ -44,7 +46,7 @@ def __init__( uuid: str, optional (default=None) Unique identifier of PipelineElement. elements: tuple[AnyToMol, ...], optional (default=(SmilesToMol(), - BinaryToMol(), SDFToMol())) + InchiToMol(), BinaryToMol(), SDFToMol())) Elements to try to transform the input to a molecule. 
""" super().__init__(name=name, n_jobs=n_jobs, uuid=uuid) diff --git a/molpipeline/any2mol/inchi2mol.py b/molpipeline/any2mol/inchi2mol.py new file mode 100644 index 00000000..4c881843 --- /dev/null +++ b/molpipeline/any2mol/inchi2mol.py @@ -0,0 +1,27 @@ +"""Classes ment to transform given inchi to a RDKit molecule.""" + +from rdkit import Chem + +from molpipeline.abstract_pipeline_elements.any2mol.string2mol import ( + SimpleStringToMolElement, +) +from molpipeline.utils.molpipeline_types import RDKitMol + + +class InchiToMol(SimpleStringToMolElement): + """Transforms Inchi to RDKit Mol objects.""" + + def string_to_mol(self, value: str) -> RDKitMol: + """Transform Inchi string to molecule. + + Parameters + ---------- + value: str + Inchi string. + + Returns + ------- + RDKitMol + Rdkit molecule if valid Inchi, else None. + """ + return Chem.MolFromInchi(value) diff --git a/molpipeline/any2mol/smiles2mol.py b/molpipeline/any2mol/smiles2mol.py index 79db23bd..0d7c45e6 100644 --- a/molpipeline/any2mol/smiles2mol.py +++ b/molpipeline/any2mol/smiles2mol.py @@ -5,16 +5,15 @@ from rdkit import Chem from molpipeline.abstract_pipeline_elements.any2mol.string2mol import ( - StringToMolPipelineElement as _StringToMolPipelineElement, + SimpleStringToMolElement, ) -from molpipeline.abstract_pipeline_elements.core import InvalidInstance -from molpipeline.utils.molpipeline_types import OptionalMol, RDKitMol +from molpipeline.utils.molpipeline_types import RDKitMol -class SmilesToMol(_StringToMolPipelineElement): +class SmilesToMol(SimpleStringToMolElement): """Transforms Smiles to RDKit Mol objects.""" - def pretransform_single(self, value: str) -> OptionalMol: + def string_to_mol(self, value: str) -> RDKitMol: """Transform Smiles string to molecule. Parameters @@ -24,30 +23,7 @@ def pretransform_single(self, value: str) -> OptionalMol: Returns ------- - OptionalMol + RDKitMol Rdkit molecule if valid SMILES, else None. 
""" - if value is None: - return InvalidInstance( - self.uuid, - f"Invalid SMILES: {value}", - self.name, - ) - - if not isinstance(value, str): - return InvalidInstance( - self.uuid, - f"Not a string: {value}", - self.name, - ) - - mol: RDKitMol = Chem.MolFromSmiles(value) - - if not mol: - return InvalidInstance( - self.uuid, - f"Invalid SMILES: {value}", - self.name, - ) - mol.SetProp("identifier", value) - return mol + return Chem.MolFromSmiles(value) diff --git a/tests/test_elements/test_any2mol/test_auto2mol.py b/tests/test_elements/test_any2mol/test_auto2mol.py index 06726a24..9cbad60b 100644 --- a/tests/test_elements/test_any2mol/test_auto2mol.py +++ b/tests/test_elements/test_any2mol/test_auto2mol.py @@ -17,6 +17,9 @@ SMILES_CL_BR = "NC(Cl)(Br)C(=O)O" SMILES_METAL_AU = "OC[C@H]1OC(S[Au])[C@H](O)[C@@H](O)[C@@H]1O" +INCHI_BENZENE = "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H" +INCHI_CHLOROBENZENE = "InChI=1S/C6H5Cl/c7-6-4-2-1-3-5-6/h1-5H" + # SDF with gzip.open(TEST_DATA_DIR / "P86_B_400.sdf.gz") as file: SDF_P86_B_400 = file.read() @@ -82,6 +85,31 @@ def test_auto2mol_for_smiles(self) -> None: ) del log_block + def test_auto2mol_for_inchi(self) -> None: + """Test molecules can be read from inchi automatically.""" + + test_inchis = [INCHI_BENZENE, INCHI_CHLOROBENZENE] + expected_mols = [MOL_BENZENE, MOL_CHLOROBENZENE] + + pipeline = Pipeline( + [ + ( + "Auto2Mol", + AutoToMol(), + ), + ] + ) + log_block = rdBase.BlockLogs() + actual_mols = pipeline.fit_transform(test_inchis) + self.assertEqual(len(test_inchis), len(actual_mols)) + self.assertTrue( + all( + Chem.MolToInchi(smiles_mol) == Chem.MolToInchi(original_mol) + for smiles_mol, original_mol in zip(actual_mols, expected_mols) + ) + ) + del log_block + def test_auto2mol_for_sdf(self) -> None: """Test molecules can be read from sdf automatically.""" From 1f83de3df4ffe7547252a60936f29bf4eb5f5254 Mon Sep 17 00:00:00 2001 From: "Christian W. 
Feldmann" <128160984+c-w-feldmann@users.noreply.github.com> Date: Tue, 10 Sep 2024 16:34:39 +0200 Subject: [PATCH 4/6] Add `--check-only` flag to isort (#88) --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 5ff97d5f..8ff938f3 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -150,7 +150,7 @@ jobs: pip install isort - name: Analysing the code with isort run: | - isort --profile black . + isort --profile black --check-only . test_basis: needs: From 4bd1dfbd54309d56d1c18b1faad2b50cb9867827 Mon Sep 17 00:00:00 2001 From: JochenSiegWork <135010976+JochenSiegWork@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:28:37 +0200 Subject: [PATCH 5/6] docu: update readme, add image, add feature calc notebook (#90) * docu: update readme, add image, add feature calc notebook * readme: add feature calculation example * readme: add published Molpipeline paper link * notebooks: add header to feature calculation --- .github/molpipeline.png | Bin 0 -> 36910 bytes README.md | 83 ++- notebooks/04_feature_calculation.ipynb | 915 +++++++++++++++++++++++++ 3 files changed, 984 insertions(+), 14 deletions(-) create mode 100755 .github/molpipeline.png create mode 100644 notebooks/04_feature_calculation.ipynb diff --git a/.github/molpipeline.png b/.github/molpipeline.png new file mode 100755 index 0000000000000000000000000000000000000000..fc6129a282fd993069d0b2110d9ea607778e41f7 GIT binary patch literal 36910 zcmeFYc{r4P`#-K5H{B9iNJ3I+v9ywPvbUkK?{{Py#%?CtC@Bg_iXvMo3`SxMhEb_h zmQjph#>^zkU<@*5h8bgim+F4L_w&0w&+q%!?|U5I<8vKHM~Att>wR74_Bzk=b-u4# z=dCRz|JeSAkdTn%=~HGGg@ii zCwuy2O3H6u&6%>ts!UGG7)$OlzFU+cx!qV)>-%lFwA8OjPQ?d8r5&aaDBc2&&gj36@1<$fbQD9FjicRcT zef7HLs&AzSDRwobPfX3G4_O|dxO~5TTg_1fI0+nAwTCBeCl#cmuvX5Rzd8Tw#}7;4 z?*x@)nkzCgVcmfTu|>Ono)9v7kGwtQr7R;u&o5uRcl+0`((AQY_FBG?&k$bzy;>94Lg3l 
z)%`6|hjCSNDJiLC>__FlemsA%??!X;oTZtW$zRIdc0q1_{tckR^~S%R?Pccvp+h3U z%xt&Fsw)9UicFM>me#V2Os4dfpTF$1`2Tw$44_dLk7djB)LhR+;#(x04Ay^)8{jOmkjFriLP@`~aEzbt3bU@x(9Z z=lH9hq&oa1&#rd`;X$g8`)+`Cd6fo6Tn(2n_<74oh0w@%R$G*+6Hg@jIuJ$YjyulfE7 zLT&4>R}vD^lin)(DfFTA<6Azbq~!q?q!Dgiph^)c<+K|2wqk0${qw zh3~}Mn2UmEPxP)yj=tyV^frf4dOruNDRke|CqQ;`A6g z0}nLU&j2;9Mqr(izF$3atXXV}rn!WN#umoJ$%Q>zHMVx^-`H53|JB(^0=Vd1c%-?B z&D;4vIoG7stWx#*_c`aDvOAxBzI4+y${72Cb|;f>BDGIBqh9m}_lf~e^;!&nw+*Xq z{??GVG%vB5c1|81sL`p9@RbOHgkE;_Rx{I9cZ6JZs%w-auA~F!{SQN%?b|Pzska{8 zs_O}4`ww#Z{qN(tgJ=>^f}^Wfh&5)Y+n0G`er`9ge%uS0qN&Byl<)q%r-7I`X*9r? zmEDE#4n0S7|2A7a;4mQGb8g+l{8$H>%c7Gdb%wsy2a~7jA*8*9i}Ifo^`X1zhqW-% zN6_J8X9l&3yJFuOtcrH2Cgggfs8WsLveRnn5>J&pQujJ~Y5?=fzQSRTa}KTrIy1Dt zwniEn71dQuo2saW1`?1_Vmm^wPCKSUXQ&RslALIX<=o?eLu>|V?54U4@0k-iIDgtN zU3U5i4Zy0C+om6aZ5}(c4~_2rgx%M|CI+63=)O8|i?}nfD^CX_si56(UCPJ_?qMr` z40s3Wf)>heA+%0tn$osyciX=|A=d@;V!P$eNkoyn4&)y^fcR~*l;J|pkY%hk;zn_H zZUx*iV4W6l=dlb!XBWl5EdAr^S1z;%ng!)UulZb;@1tqsk*kQba|2s?cT}^LTT4vT zkzKZmwXTzki&wH=-;}aT+s+$@eRd_P|EYfVNXtWc>b`L?;1++SrFZ4=dHJ*|{gi>n zP;8K`OF1sY{PzvOYDuqOmYN^c#(1AmEbVwo-doM}7<;s}WK3Z@JCtjrNOQNg24Y=>;DtB!f>XP>$Mvm;tw#-AC#KR8~S-=uP$J$8;5FiY}rj=?u9)67^)R;v*+ z-hlEVAMSsieqdxr5ieZXc5=YKXX0*&uGhJ;3UbG?C|d*OYVZ+&@n%-mb|We@@JbV_(xn&QshW36OBu6` zhuZ?~UlcZK+U_0uwPF5chh{~^!Q+ViYGZDq4v!D$X`_P%FFLw{?=xkA8}Exm1wCl@ zGE*;{V&<*pm2_|6nW;O~+U_NKO6{XScuY;5LG9Nrx#ak+X_vZzoq79n!y;q6<9FrN zZ2$AMW&bAcjz={Ra+AGMhW8hxsi)wIMXjwR#ztHAvzJa**t4Gfyk`Vqu<`HrD1b1L$p%X8&k-8Ow~yLZ1c^L5aKeH!9|}abkpG4#w+E|O@UPco7H*~AsTLL%p1-mq zk9}3m-t(RR`TU%B?-2;C)hxW-RMBxPY@19+DR=q#>QhHn6)f9cwz|9>&8MaYF?Dv; zxbM%m+eh!PcspNi)Vi9gdrnLZ9LRWg7y&+;{yRR}Ij9oBUCqKphx@|pm6By@;t;ry z)l_}#j$ZvI@31rk@nVT_l!vTB1p@oS#{m34P` zdE4W2e&||7xw0XO4x9rX@9*hQ_P2#c)>t@A>_ESwYc-Y&{&7cxPe*nWE_mvk`KpF? 
zM8!R5AK7M)dS-Ebs##sz!9Mv)JhP;vY3ul;}i49%V)^Vd;}IHFb!RxUx*_x)viTm?~e*RR9Z-pASlZ)9N) zDgRugr` z9>M-MH)%!~{`G7x2mA3+dxp0iK6|?)g86d5$(|K$AN$v{-&P_zKUNU`%^R5r_FvCV zakhVy<%r7oyXyZ(a6t9{WAXoSb;QpY_iE9HPKUapgRO=d!b&jHFFU4&fI28H4~f)l z++k++w&mmZ-(|e*{sRWQB?t4^j9@;O6+HIo!Th#d{Ky@B&Bi~;jS;18QoD9R>vx1m z*C^WY_N<98(3c}4Bf$(JYQABx5*mi+zXi0)YV8VUL27vHhczQRzta5#!rONrkbid( zQi03&{Jb3gA1og5WIdS&fsOictTxhSb=Pk**|hb`Us zZFsp+n~KK#l{WYW89Z(9KM0}car3_PmWmecR0VmR*p8hEj@$7T9uP{iGg3;XqlCMB zv#xAJl6Tnb7~b!{uh9afmvR-?kM&o%j>X8->_Ffkr&3Q)UMQQIRdYAQu7$ldS@!K6 zcQ++I31E&h0+G$Ws7ChmWG|SXG7y8OX+Mot4jIT$@j^`-@_LA){Umex$CQ{FC6teHPHn%FiW!KrOyawu}>;wi{4Um zG$Vq5M4KqZOcaNclGqI+ce3b~Z~H5ZYE`;;Kg?BOU+DQyKTYn@bh@p#&x;3^qiH8P z@$cN|f1`m}bl`t*2A{B7Z)(I z&jq!cU*I{@urqw*jW5CPr1)kX7O0*5)(7lCVKem=N17cEIwCvyoK3E!F*s9~NO5p$ z1!MW$Q-%%%q|}S;Ex1w%{&*JCVayViajoDs-#mu<;a-*Y@z9okq;G0vV1K3ai;WUB z(?ZjZ^E?d4^Ax{4 zG3{|k4-dH^%O-A-^=Wl-Xm2t=jZ{pD?W%Y9|CClfP zlB{`*zy+RS8mtDxTNk_^;h(|SfnGzW!`*kODt`SBVhYz@p* ztXhE7y^Eyh8(dPlci1yZW2#MEE{iwCErkyzTYnjss!ydIF&wCznA@4*y>FLxHK5wM ze3SElHOCy+RBajx{pP5jk~YHd%5yGke}-#Dhru6pL~A80h)s_p>FK#6#1_8YY{aS5 z4aRt`zI~oegSzUi+Xa;Cb3*?+0y!xhDA!JLuEOr2j7Qr-a%p10%66O*1Ng+mId(7d<`+J-#_C(Uf*Jyv5l#T;RT`DdtxSeTsO z5Ky4vA2fVYjUubu7AZP@Q&Qn?%UKsl9ZO{oLhzBY<~GdNZZRoicEFOkv>yP_zxJx{ z)m6&e?>jf#m~fvSKEggKz--YIN!Qr%;zET_E>1H<Rt1}=a0Ey9srii} z=Hh>?-=N(hs6X31lJmWz8Ui*Y`l9ht%zbYL#sl|=KtO;o zFPiksyYw-#p%Pe^1SRMfIscTJ@>O(W{y?|DHPKC{MAVJv0*8o$(Jct*0Jfhpbxu5& zY~8Sj0=7ft*qijQZCh|EdqgzQ<|4)B-Lai*Bq$L$SG&kva&qxY+ zQ)OWv>-@Ku!~oq|qm`*k-%y`EZ07#|77FXAs_gU4l&;fFZ8#C~z#$LO*a>wJ*Xt$C zLT#)*7NSz8;y3PB@n1rDTT__0sHmthquZa9+@AEOr;MGC5B^&MYFpc@J9nx0M<89% z*8P1Oi65I**FPh!SIZ303G2uYo~UC5FXSpWlv^snYE9c-LZ(-}FsUf4!MAIBqq`}X zX+Sc4)W^+>Vol-0^HZFZ@P0Z^&3CNY^NC~po&T(%@j8YV^H6z{S3md@eYc(s$w(wW zH@sH(I#RHRLUM|{-Dh8p1$zztL5LmQA3PmPzzlvq%x_Au2Owg-@km(jbE|OD&A#7~ z#yWC3SUQLn4%j`#NfS>49aFi>;!Z>?p=7J)S|j2(&H8#ZEwac&H5 z2B<4t({;EhTLXrBzR_-l?3aLSd1VEfPa+u_YUIe2{M5JsQSK;EL*6CnJZ0~>%ODVt 
z?xeXKW7(4f%W<9g$=fofY5|zJqq7$%X6y_I6r}T=Q71cLOXsSAb}{@gPk`cB=poE>7tgHl?B=8*+0jqrdxS7 zdL%0fzldb0yVYkKf)CsE6z&DXDo zEqU7p`-8uK_RQauNQ{}shH8DgrWhT#?lwXrN!Dz0BG|7fhWxOLw^VTX{8W$bw%4c(WnJIrwUJkh3Y@*$@_Wt$7Cdi{QteY<4KiBkXMRj~3TT@lm^ zrlR9AMMIzQUQ{WD5;C+Icjp6PbU5B-W5ab&(=(*9Y$t*p9pgt$(`VmloU83eF9*Nj zUvd&t%HNVmWIUdt71!y315>T^jU7?PE2Pxc8_R(&jrQ@SFxT+3hHG+IqeezcOo0tA zmIFWN##v{H$;8EXS&=y}-0r}}ak!?MYB&1t)_SBU<;bA$)MkygyJ4LBk#PI5N~-3>nGSo~3b+HXcVYAN;!p%lGTRdl|u_pF%K$MBPoL zrCD>46Ky#S*f#4kCJqt~6>aA+=2)J2yPbzXQKy&e_{nwjmw4oQi0j$l^o+4oSFYMv8wCt0{$x550J^00p1CRo+AZ zl6x*k<2%r{yPkJFH@;Q6+PT&ju9&|g(HcY^d!G=QMF)LP8!czxFFk#fwj4*HyUluH zKc~Y-^mIr%FL39-ScGjv@}KgmJMKibBv*!yfcR1F3a;G)Iy#tZu<8gyEqxXFt>9~A zp4I8%s6%y}54J;#xP%d3%l0!-sm(tMwe9bpOjXp!P!m%ID#4)57fsHasb5?ho$Fo2 z^-k!G88Nd!4Yd3VTdx$jY&9s*gXuSOqU6Xk!PwTbng^3ym8OxdeyEVviP)J0`2-8B zc3t4iN7+R9Obyr7LtjG#C37qKj?+uVmJ!=jW?an5cvl{c(0fZ|B1lk_hZV>W1pPeu4hJgnG&i$-5^uez+ zH|-Egty4|ay4UF8^#=)g!n3*rE5o@mlHu`|3Gu#NKg)?W=LOplQ{`0Ynp%6_qyHL* z?j}9A%H?58IOPZ}ZjTmC_`NW0h{+EiJ~$2hTb!;-A!lQ70_LRR&p-9tGe($-RU+7wy}u?x^m^U zp48&2E{PRLK?2KWNU+H2APbr=Ie$xNZGfnUH zd9P1Vn#ZvV@2{jjHw^sylJ-?cZ5ToB_fYh`Ly9ZZGU`uD- zrAV9L67ExVt33-ss(tB-rNtEbuIx%R^oy3!SN z_7=)lTltJ6yce!{(};$6RE2nRb+M!is0;us{9^+PB?Y=Q@!5jq0aUgi0ox#<#jAO@ zcCF#vsZu7Io%|R>8@3dSoAZTD+Q1r$Yi_8@hcs{oci*ooeHX&*vi-Q3RFSaKLY)a0 zONg{d;b!B=V{o+hq?Fb6ZTaW=BYKh(3|lMh3wiFFZWD3H6@1pr;}8f;jz95vRC{SO zs<&juQ<>ivCkn&E z)*uy_dA4*P!T6jVJGyI1yg1+AuKM6CrwOkzv4AWk4@iZC#)rhUDj@#Y+Vb%Oj@H^8 z)jEAehn-*8^r&E5>6>;ho^DY+Fv_J~LC~YW*POl^0N_#~pY)~&>rJ8WO>hJ`&M(*| zKubKA>*)!kWH*jyvr3~mmdg@;kU;wIWd&V0ws*F&8SeFRZpq{)KED{}@3E~qjB6YE zI5SYAsw;59Kq2HMumdBC{j5ZzJ-f93dW6fI7p#1M_U=xW3U;w>Qfs<jdnh1{*NO3)jk5M{JHtw7pI3RYl{AcFc3vohm1 z^j<;PW?VYIU@+oZ^&8v+xezNZgEZZdur1~TpF|FnO)3`9ROK*&BekW^1yNU$c{Ifw zjDUBo_8y2q#dmDW&2>MvTqi(l$7j{fDA-sv+QT$<$a9g9%`?1&T&Q=iUcApG% zExh)9wqz(nuLJBsdSh)3COqK8k`dB<K@naNP0_Bk5 zRQG2OH-vPGg>ne=qX|We-mW(!RC~^5Yk6u?JA6@N#QF7RFg3Y(w!VKmOkeCG>7W#Y z*habTWJyQlnh=<8M1o(%o6X{L9)pR%?ZuV#F8 
zw)X#?)M4d2fSM%D27~f!2{GQ+NS)Ma%zG3ZR@RPHyUeqX&fhgfdv|1wIOWI%h z`Y$Mk=E{as1fy3mP`V~;B0AQh#6@2Mq_NGugPJ2)=w!`MLqOq1;(S4{?pwYx21{Sz zxD%QL4U<&D0Dt%pJ@6x^8KM`dBI2@H*SXlg7WA|tZI5#G)~N*+;DBQLUChk72#VJMER zhS3lfy-G8-1%HH#S8jSli3hb9CH-uFuP|5cl@ogkqZIIWMXFhZYUxdL}M?06jLK#2Bc^ zZr!4hR!c8w#=g2hY?|I)MYh=Mo5^=NVQ;lY7ytg{%_6zwubHC7-c`^*o-)Q_EYNiE zfv4f4A&=HSTLLQzPb6IDrPjia3ub-Wa4v$S@98rMl!awPPed!Cb@n?oBo3=jyOTVA z-6oG05i;yo0tJhM)wV_}yLm6}C0K}+VUy3tUng3tOmvRebOHTM6AWi~uI#{}V;alT zOR3&&)bFK@-{~QzJ#63h_A{=XF7d?nvkDj25%@VF=Mv8;JeuP9B%obKV?CMHI$^zx zsA{z9bUT)33d*NL?w8|9oSU4+Tb|@$-}VD)6T1D+fRH_h!ihlv` zfr}}8__FF96Sv5Nj1zWmkL^dbFZiN_Y^&5#*)A2X8t8otP6^r4esC!*e%z&{O0KsX zb$esac6%1+Ck=xqb=K|I{^NMYg+gc4p+1T&9u2+6cQQh_tW}BoY%ngI*dE0zoDu%; zsVRQ$Vax!RG)gzYRKOx%LpWz*f8#sn)vHOlTgtdxvR2 z`}l_vJVyVx$Z^JHJCu}9N=PA7yZW;(^vm;p!bX*{t`gmq5(W3!Pn?F=h|r$Aq(ZSB zV@;mXPZTQZ@+M~7@zP;R8&Gr;F$?G}c?m*1Wp$q2ILZw!!G zol&revjr|VN?m4|`JLsV95!aNt5A+P+j|KCo?C-v%|0xGO5RldDr_(y*20UuFLas3^zGSH&!uhZ7EVCg48 z6N~!gBLKQj=j-#C*te{f;T9w;Ga7^`wEg~d^Uw!-YM8B5w)ev4Q3z?g;5We(T8$as zPuUe>T*iuDv~}ELlU@z4N_Wd23NKYKq8&wY=J2O6#6wYi!I=qvr;>!JyQ+UsAvbk! 
z@n@@`!%w+ADbZBL^KWN!T|$m(j-Bw8tJS_}p;5ZQCSU+q1RxLO{^EFLDba(56ogperV4%wT9OeP!)lpD=6M|8qmmx5h$QAL^=|p__oK25rjhXdUKK=K+v=Lm-fMft;4mhV5X4K6}#Z}qnie0t*xcbxy)X-P=XG^Sy70L&5|#%A zArbe8I}q$Ge4@`~`yV|+SNkqYQ3Olpi3@a<9YuM&QkYWX0W!4aVs)_K^MQ`=)aof$ zpaH8b?Mb!$Me(d`V-|^wekFqc~gY9g`?=VHJSlD%J4A8$(bXZIbpB=Suhfp7pS@_HW z%sYjWyluxi_%F9n%4^zxGr)lDyTN!Lucp78JF&(bNgaq#V%E2KahFl?^JAHEE83Io zIw5%GHrZiZLDPVH+-%@I5FW?8Cmjh3?`Vo~-v^;6pePOjYO2xXriN-4pd=(}Oxtn{ zMt`TUWC*bz5Tn{KiFw&>UgNIy3pax(IQ`psO{yV&JTdab1M2C5->C6DKvUx!pW3|I z(@<2cp;>5(Jtqe94nBR5&~z=U+u9?9R1Wl)XZkE|vhzb@=I=fqta+B6=I{ezn3XFthc?ju2l@CM(0*lv{^biqW^}MqekI zTt^{T9MD=GSB4$M6d+^yeJxS|F^7zdZ^vpizQeL*qhqT?5vI!6-H%-R-VoH8r9TFo z`IP_#=F<1h=fg;*3;dM#1}QMQnh&=xl&ZTfJ3zV-6O~_8zxldwwc#UINY*Yp{%S|9 zjgQGg#Vl$6W>EcFsH%s()#q_cD*qux(;UGaU&|F@#kt9)$u!d*Z05I+8`bPNiQu!` z?_}!CJ&=}(NsnQ?I3#<;MS(>;pB?ZUv)TI8H0-jc+9jVqj8|*y^k8n-rXG9hdj^XM zXE(EV>x7L!f}dj?HA~5l7x9Y0b1j9TE!8o3bV`V3LnTGCq~lH~nWsE#p~*w!RmHcK zmXhCEq1BB2sF&3C*oyVtEQh8x8{Q8XeR7c8C!8_El#TvDf*(`1r_SG%jw}YZbUC%e zlQd40)vL=@`l^YEd5}C2kSsJp>o4nbeb97x<%hsU^_R*A1zVK=xw|0^=u(xJdVAu; z^$zLnA#EAEM5E>|jKn4tDJmq6^X*RM>04Ey0d`m!-zx2^dsuVT@uuXLzQEi$p4c#> zrxi-;ayxlTQ)tZRK={|^O3`kbCHX*7ZO6j%>7FcO0^hN{3zC6gc%w0D=nu*-4@4ib zW6&9w&Wz;2r+=rkbhDbLlXjm+!dpZu+Ss%}N6anHm3{`YvKlJy@l)bv?Rodrl5NHn zKX1A{LY&yWu8hzv3`YAMOk28STUvw9*9t`eTETiIyc5~`DUu)$ zL_eb)Vb5P{l9X*I65nt@L@_8#oSS=0gukMg@ON_ulj^Mic2+wh*9De|^lbHDYa}_p z$$6PnH2$Hq_1n$ts)thJ=VEuo;5@5y0mtWXItWqsdTmxdN_sBuA)v$4>3X3)o3?2^ zo{ca+^0HRI@M{c^k+lJFlN6Sw9o+@%OEELTaulzSV& z9d*7(v(8R=`vncq)0&$_`O;T&9lR?{MfAmNwbSwzqjcZu%_S zJ2vPYywU}~#(q{E??KZZF_>g`a#7nz?@Ox-SZ*_QH-B+BKOy^HuHL`mqR--RZnV2qe|b{1UDY-b$gIMI-1XEe0@|iyrnw2 zlui_+G`S9Ci7DH!>@b=Q*U>-ZB;Znm&jx4CMo)$@*Ol=DRLyvImh^R)3*TN}|JnRg zh$CkTZVpu=#*&Y{pL=C;-CF!)OI6(6V#A{HtokU~RHgh1t~4;*4??iKv|SD7@A92m zs+v1avf9_!&*H?EUbt4uPAV$d99y_qGsKhiy-|9hiHZkJm+V__%vn&WbLP2b!$*A> zG{dLbZ|tgS`&TX7(^z;`i9I5$EAc&O#s)8Y(Aoy`U3uv1?+%4P^8t_8w3E;+2S$H@ zwJtiX5?A;ZIP}VZWR<{VetT;dM4qe_ot3A!4G}o|i%AYNzBlEnqzUZ=o1vEPZc=)C 
zslr4gk7&#)?bQe`eBQ78mqex%#m_0z?R8apI=1oO*J>gRMOs<`cg+i8oa$+{HtkWS zzB|=pz0sHfjbd@{)*mVQaP;AeNs&IiKn);LBN8{}3zjb1jl^w(&8<%=l7Boo6;C?;`S7L=){aoSXXN1sbeopz@b6U4El~*g zmn^RF%^ygWyaIK=M)&7-Zgf4oy{?RJiqdxJ_$>?gxI~R7d3@_t!=bluUq~e+vk+39 z-?lYMzuYC^XYGW0Acwb<>X`tGgT(u+67uBV#CQAr$O-fSc^nx zIFPZ2G{Ex3ZooG$ZXp!nX=4v`>43H^-7b5N$Z?^HBC(b)!H5dRjPK{w#JgVZ{$|f#vqJ=o%e%F*BpnHUj$+QXEPK1& zgD+3d_>JOH0l>FWHo@Fq2_9;=ps9MlUeWct)t?Mtr%SA$Pw&CA^~ON{m@7q|u}I_| zLh_!qH-^Ptj3)Yid+$nqHCPA*GAvHA0D4W~3a&ADzB^VC0m({_ir+ zR-d*=;C@bZ=T(Q=azIA}%o-q2-)C2yve0mG@-v=%TRrRaV#DJIsQ*AZe9487@J-AO zV7Ed|OOdc%{esFs9TYQwkTAMmVqBB~nhes>H@ZORulU*b9>vMfK2=7@_}N#f0=AB= zOSL9>l4AV}C3&$0?d2BgUdQ|kJkL7>l5{fEsIH3_`uq>XHle3XPvK*Ly-h$O^5{e= zCNpG1Q4l5oRnXLd4EqZ1*R`q-prbd^9ns`|Rt20s!*%iG6G#RJBaB=2`H~?8iFp+z zsR=(^kYhMv(-f!_Z37J63)vxfO{w+9^sd;8vrlLWwGz7c3~6HhMY>z(m!u+F+u+4F z=|~LNx~CvV38WlO>>aj54vaPx#E!}I~e&&s3lE(_OeQ2c=S4HJH z(w2d34VHyLg~=S2+pJN9K<2d26e@uvs9+cG5vkIdm59c?=orcCd{>ux8qk#{Yj?$} z8x5buK28&1scwRZi~ysBT*1;_>9RBb3syCs@BOgK$M z5?o<&o>aOu`g2bEDLGWZU8vWUKU+7ad!&(O@5cutaG=}A78xY!N6hX)17#!Hs1?I3~T;7fi{6HW{tkSp=^ zGx?DC221v9gV7;s!?n++mzwTE(?K3`R~^KP&C|pIX$c#x!MZ9MQwlEnMYOHwj)m{k zTCLsmw{zcaW!fa`#Ck##0R|Wr0YYaM74nFQ7g7zi;MWxdAFHRbZw%%SF4fVELVbK< zl?sAv71rx57E69)J20$%mSkzr*Uz|FInd%K+|OxXLls8d+$#(CO>qPx_@;wsaWrL9 zNc%xl_jzK}NtxHz2mnC93xhz2e#JAW)nSUSZ57&{sA$tQEjE4BpJ#%aHqZdCoMwBU z)tTixp}_pL_}wEni!zp@oo?QNADajyP#u78=8j^a31n^spArK5NVu`!n6UC>!^swQ z1gi;n2$*S9rp3~px9DOj5G_nnR&)EDn$F$`I5)FE_Jft-!oV`lvYLxjsOB~ro`#2a zQ!r z7t9+y%z{C2b^YuL)>wdtTt&3C?Fi%@wut@sNFDnb(j{acUhgL$P#pYWH_KO!KT)jh z)0;mwI|smm_jrqPg+mo}trF(FbRB%b$A^DhFr(O=Ao4tV^ah$x84K9>41Z;_a^`R; z0gm*L1IF$>WmMdNNZ4M~wEMVQ~r`!q*MB!kb@_2$#%ouEF;^kH*2(4>OGrbseDmL_@ zAkgMaJ*&23V`1$4%4y~C*QeHL91%f%hMS8O`Fcd;mq*>GBew}-^mopxZ6eJ+c7YWj z#EOo27qt8Kb`$3Rcv9={c^3h{ab$F&*#*!2)sZ~uWTx&oI;*Qz0lECb#9=VwxL*OU z28?f7b9A`gSYvsIQ!B;Leee;#{N){QDmKh`py8o1zx(=Y%NrCyUy(i|mp95q#1B3K zMG9DIS`mWfD@OWdHIpiw(K+%ABAK$c%_gF)B^l$cx+GSURFs;>xpiiO;S^XPbp^bz 
zyWyD1#OPCnrMFv=v8|0Ha#m(h^@-0`Lq?n~!E|Su0(N0)bn^QO&0S29{vIh6o9b7p z%Noc+^4^7Wy<1l*nk?`R?YxB za+`|5DA8?J76hCeKfxymKmvv@o&7w%%aExcrS3%io|2 zaSGXcR(K4{ri7TiomoVU<+@U=Cv#%|@nYH~U~j^@iTWA(=n<2%;!O?aZlg4Lr_m*| zPZqS;gI3}$VH1hz!8;{K?qwEL_VXXn)^rSV+2XT`9SRy;!rf|u!rrY9ymJ1s5ZL9^g;B_Nhr%h# zl5sMpIpkWGR{{FtxNczqpv)iCU>>kFxyVCe(N5VU&#&h_yi%7r1k@aBYI1{oqV*6r zmVln09S@PHf_?>%?r2Nw#_fPA9kZF8#mSoDAckPtuiy7_91+s|Jq$g5meU{=$AEM4 zNidBoJHP0JkROQ@89l7~bD;n%IDNyG2aDwo?Be5Ix`qQbQ zf#jEmFu-1+f}!7L(dC9|;Ierm>BO5ytEKzi*kD2f-!W$K#OTy1PjMiwiFbytbv<&v zEHu@{b9e`4EL*aQWtd8$4t=I_oM((=JAsLO%VgkYyM_EoD zwj63D^RS|qCR&ZS<)1`(6LEQhDw+c)fyy-E%%U}50vx%tK@d+UkpBF}M&h`}T;)bk z0=w_sjo<5CZ`6OOk7BK z8xsUFkl(5o4WqX{x-&T0UI6St>*`vc8^i=Esq2B=dqgDtKLA@1DZc36s@*a5W5cD7 zX#39FRvmrHkX72ExWg{6y{p-Bm%RcwxT%>VTpU6Uv5QRr)cc5#qXGcVPpZ?S9t`p| zE}XJC#oP}009Q|ohL_Xr>1%!Qa8uma4{F3{@?7LDy@;sBE~bbMmv#y3DSlPBLpA6d zp@e?yUpmALF$$~D7Z8(&&UP=c67NQID-7jnVIp1&6rn3sGbHqJLPkQ&xZF%D6%K5Y z;ez1&a1Ip;C-Kp0t@S&1vR0Hi`I>Du3X*zkY#i73(Uf)jitG zx1B&yqP;0{PPOp#`0@hpMM0~_*xs227_tk1;Ks$XNP45?%+eAti`uM1c0j9l&MbJ_ zG|tHFco1YGk)x5Mq?<4+b=CIgmTFXdlW6f-{J3d`Nu;S#XvDS{;nW@BX~;3F~w@8cmIS>ev7rq1znE_D@ zv6~pp4>)yOQn!VFfxwwU<1$2045PW3sE-T6h?a7os{{5X5|Q;>FO?+57A2=pWWVRr zXzD#@2xBQsmFRyIHGQ5K4NNyH>a=X_0G|84H~*p?{g0pQnrR@7UA2KUZ7qSX?m(CGPu=F4yHqn;v;^=O9XQ{4mCsQ%PVQdmOw=`w z^5^u`2n7+z^<|}!e#dc{!BEQ6{=v$Bg_nP7 z!`5g|*~4AuCNhzLLp=I#4pGNBKKoeeMD2x}F_fDnLI;%skD_`L;QElkzuXMFs7kyB!;S9=!?Z?N$TE8|ABl8~(zT zQ;^W>E+L=CqbpE43w3({(GmUf7g5%0g~Hv;qErJJ_~B$6#_KhmIWL(rl=*uB3*D|f z_d3sgcJ65$Fuo?)dgct5ay*swY_P~6;7bEiGi~6Qsp0y=xDa*Ig&K90iRpm`1tVVX z$LrC{jJcMNr+Ol{tY)jGxm0Gv&F*v&!)~kP+nGgOC!z(as|c}sU0LDg$b~G=`;eBO zOiiQ4eB!Ap&{L6}!y4bto;mtfBUk&)X$}4Rsrjv_Pt9%D4aT3^)7S6kT?MGrn|12E zn&#PKM;Z08s2K|%kZgZ5t>1yZum%GOV5b-X{!(*dC~q@*1->e!9}HCc;d*9pWMS)Y z*ivrKN?VWVjG%?tbwr4-jNUP|QcidLVcuhT&^J%B6~9QU9RLRPE>LmPG>28U-WWf6 zQRn<%$PzFU?k$*{0XWMK)C98RbC3-?3}^VpYIboX(dX-z$MDfG0`mZ9Q{77|#^juz z+dk4NQ~u?NCh%4{!Jp+22q_i7r9&vpUVG9sW@H#2dvZ>d&{21!G)-#Z9|he-P=B 
zos#*{HFRX|<~mLfy6IM=bq^WfZ6=lvx;d^^C4i8wGnKBgzOX55vU`Y+EBDXA=L*}f zbE(1Eq~RKiJko&0a0TR14ni4_`$M}y>vBc?Bel13GX3N6hg(Iyif?Fi+Nc^hFkNWx@6+@I;ZYVU+>#-|_hdJQRc7X!(~mlq%ve7-}nSacIO z|11jluu&Ia5l)NifHo~$Eg3pzLtVx(UY{zN#6duQB^D6tjw!&(b);e53bzX2C;(zY zxYT{;-;7*Kx-Haa7r!1L9*tzf&XzY0EM1UgF5mm&`nIht+lw-6e6)DM3&BddAAb@U z0GhgF^PPfJfcfv-c+DzM_pZ@nZ#w`qclmAf))$8sDqr%hmXP1F(0#4Q>&ve<(}&)1 zu~@Ye>}UvtE@T=~qkcLBPW=yVmQy96tN9$`POyG^V^#fXrL< z$4M)EAuu2q{eRkf^Khv5_kX-jDo)wz#F3D4=#-_1$vOwoYRi@_6|&CAHnPk($yP!o zNwTzAMhIgaOGvf`l_j$f*~b`>!HnVie2r75bE^0E^SgfE>-)QY*Y%!1>W^8xp4~^QtJ) z=ZeI_Z7ODf8gJ7d#D@;r65HlWOVZ6Z1x1K$r*%3iYj_XJxtLS?5(852zRR6{pmUXD za#`*Ek6K4`3`LXP%~tl5kEaCbd-I13{tQc9CQs%Z@>|)>!Ab__`OB6qtDP`XS!~r8 z_tWpXv1%;Vr)Nx<9_8VmdEI7XX0Pi$JD)x0To6^hqDH8)D>{h%=UBY#1Qoe}&FRyC zfMLJyRDFx+My8jQg{2cjfU)2>o}>CYYc{QP&gi(5ZpZqmoVU{HaFiO`C~v*sy8x;+ zkUfSNE!Cv3bcXMxmDyCw`(x6c{gC&R7lWgLgj}HkVfotb#523U+FHUrjGTb}EqC;~ zN;xYas2%E>cCL6?7E1fcZxt;~ns+$R6{ypqwG+5Vl5(vW*%E=(iH#Hi`_% z2wEQ1O(U1RAD^Z`mtCL9WPK0OK=c@iH3O3&v3i{%Wd>%ZRnu;AEWN9Vux`K#NTX1e z#Rng_OPaAaK}TZ*z4;UIqkS$sT905uJO-dJ2$1F-xGo1xG;lCRzE<@)(^W0Pqa7U& zDoP!G){egWOEBS{vNt69Sm^ny3FL`#I{9@qVCEbNg`!;@tX z;_OB$up^skK@C@tZd6Ub*1AbUvX>DPEJHl)-YXua6-e1UR3W?sh1NqQZVA{l$WR$| z^>nD12BrGD`(gvaHiEm>N#X`b+upbX9?yom1tVMeV~e~cR7eKYr;dp6g-zwl8Rg2{ z7UGk2ax_b2yZXkyX^@33l++5XfI-$0ZAwA2k0zdWcV%&Uz5P+03UDGf?x;HZc#k|K znr06>8{HP)hJ4tYpyz8kU|0i#v3{CbL!iNKRf zW8L$hLdm%33gBXhn(kBkDrkrK2c)L!(Vse^Fzi9{PLe4uo{gA z4=5*VOwH>u9O|U%{u+o+a^le@)Xt|ShF!!+xy)bQN13|(U@?tEn|3hYt&Ldj0D+wK zv(2{m;{C}Vn0#92+K+ZPMM4E)yS^MMHD#{io3xUc)iwH-#t?xs}-G_4N@-G$7H5QH;x%*OqIxWMQFv&!W|fh5#kNPtDK0b2TQpG0Pjk z{Du5YfLC+0wNN6ObC~)n>9Ho?^&I}%r!RjE6%urr+!uzMKVBt&JK#EbFW8}s8n!H~ z&||9h2;*7mlyFra!FTS}O7Oy~Xo?GS)^TWX`AFG#<0Z9VC2*YbsX^l(n;AHP&Juh$ z0sHyN$l3~S2*Y=89UuXiQte)I<#3RZg|~`_Vu#Q9L~Qo2{P)4a-iae5HA0cx_uSXr zk}h6gF`Tc1>XIMq5ne)}j2(r0d5*B#BYV)u z+P5F@D}%fmV%5iF=oS5Ql|u%zyR~VxX1ZS@Cq28(4dS81PEp39bqxmXogCEnNZ9%T zR-Lm|4s#Pv(%b^xm7!uDmG~4vvGf-g4gBZ8;pn)evfmE%$e 
z17Ch9ce*2h^MWDCdwPW5_))u?{YI+@6%`djfYt;;1W*Jg8a!X&D3w)1siHT~`F#ep7ngfU%1rF^uuh&f4# z^Ze-8TZz$PaYd&Ouc#dh`r`?)jezamc8akUkT7{&tr!jk;prgux%Z!jQ9ZQ-vy(?7 zXPq&wsZQNJG z>boJVk&;17{d~I+olobgM+b-(sS`9pHUhNn-`Dn`XeN-fV0{w+&a;x6bnlHZF6 zSL9)DrlSE31`19&OHfK44}4>iE+o%yzJoo}7t-Ge#Vv-1d9+*e|9)yvWn__jt6o1l zhbDR)T!()fo84SAjUQ)^rML(^p0q=DAeK&}R~}w{+90R33A_HR%RFs13ZY;;zh zId~f~>&vAfO}tNxLP|4l&#w%NQYJg5m)bY>oO!RGC{`dN1HUPQS1dxu9e(AS5XM?qj}KaldLZW)FR1)Ll1x#6QkJm77Ed zpHMmQ!x0YOiJrkZs77LDegXt|#N(_$yIv_SgkiFR5x@b3PBH@)H*DB&D|!KA;CSykm*%3wF#(u_xA?3-l6kc}erxhp-)WGu7AHHvjgX1($rR*FbtDx^Xo6hl zR4;op)Tzj`cK{juw(e$IF^67gw^=asj(xGF?(s`E;%rJhZlZY+N|G)#)c|qq+~6BC zlJeupb%oj8qHhW(8_#dj?u~26B)ZOD_)Q#0%NE9sSsX*M7=KL{(!hHDm`6)8I? zgRE+u6Z7>b)T9j@Rr$$(s7_%u>4M9OmINXvIVH}JZ&JX}kXA@m)gs!G+>#{a0>ezy zu_f+bnb8Iz5H3U;6~}xpR0vC7WC1i?c3p=k3T;uM@Ce+!HZSBps?2BQ?rT>)%CEKX z-qtf5)(Q$LKQZvjU87QKGsC@9-V}Kb-_r!3)DiM~S79mO7EB%AL}PkLjjfC0T)Kb) z*<8ZlPu`w&s9?PDt5^rOgr$j`s=E?wTZ5^ZbazQ!^yo#Iwr&Q_>nxVG{oiJ1(DX+; zX1Q%D6;*>~BsZH5DpalPneRH%Q^pF+OGLDSP;_<#9bHk%nB7i<3@B3v1l^sJA=IR1 zZsScu9)}-NrmBMi1D8jyNR;j284u9Y z`k4l$1d;qy?lCx>nfO4;bMSb~hO}l5UN50^ zRyY(yAsiu+70XD&0Rs@&hZNRCslq}zh-E818d8CCEl#_zVS_NP_K9lwuwf5t&|yBT z9gtijVEeZmd~Ey2!90$!{q;{6JcdIw2IeatQ? 
z_@0o(O#xpJq%?r`Qw)lauvga5?6AQMM!|=fjZk^ zKo0~(8@y#neHfUM#vrP_P5!igrnn_ICz$6{8GiHF!-t1^uHB&T<7g zO5D(M;|8uaz^2N~NZ(5!+=T>gUL!`M#vlpFB|AyL=ubJFt#!M1*O58k$)*Hkka*u^ zHAr1QQIFqkgi=y3r)WkOec0N(l(l;_q;HPy9|p+gdW!eDJ=>SralZA5=;A3q2YAJS zq>Z1Fe9sssUpTwOs>yP<%mry@PTaeCse|diWSM_PuE+i{%EJ=9i3SNrG7~2tEyEez zigWcF%8{=!h#8W<{qYJ4hN9J`=qJ?AB2m#>5ptsl zmvMK~gI5VmT5BKuTA5vEyz5oTo4VLBAmQ~6D6+xaAY}w9FeC9jO_$L8wEpavw zfNuQyY8Qu>?x))&^pJ4tx~N&Jr8IhwM}^1~==5Nm(>6<1 zvda=DQ!_77)6E-_w=6WjqNq9?p-}NACWHTxB4T^!z^0nJWD9#z-zov|p(Uan%2>G( z>BSRA`noZ%(=u0wLR{1qG#z=jG~~uE^MtOVe;o&#@Ay|AfK0uxY=5<~B_!}iAe>_3 z9cyf`c8^L-!Cz=jm^jjli@*D@X2pr#A3g4M%8s1t^A0Of_o)^!87{q-vxz{brA9 zpDI1yzNh0_JqWyOl@!iAtG51v+jIP1MC9}zR-Y1G_26Oqhn%sfKXt@~7N6ClOu+9zyb}mb0Z&l83*@r?t(1&b>h9ZX<|`0Jdl2qeyb<3atU(hB74kZ@wa&P7 zL9PLynv7o-E|+3DVieTRYiBvnTHmV*ndO7|jx-<#BWx&B*}s=xNBN_CI}Ly~zqd`$ z%5nnj)BSRUX;tpziY%9y#;6i;28TC;x@Y7j*czOoG`4RJ!v`vA+9$=eitqoTIdwJox z-Z25^(mKmXpJS(;A5ad^{p!=mV;NHzWBE+6;s7{8YSj42RG)6Yii9uobzz1CK^0(8 zK&Z>}juAnEJ#qo|jOj5y=`%XQiZN}jL!$Zp>Ame^zQ_3nfUsOV(??xkHm z=|%|f;)M#h=N3U&}$+&3A9z~jgL&fkg5y_s>kw_j1X!&yAm(RPMmn7s8I_Cm(b@^ zXWi))V*he`=sj=~86!-;J$jMPP)Oq!`n;OLS*2pS9fculUt=?wclL4_q@>&j^`pBv z3L=m)j{~^#>(nIrtfrlPB8lxAjO^&rdH7yJ?DZC>25gtZZpDp#^H-%d=kY19RgMDb zo8i5%ne2U(lScN1l##>(%EKAr=<`$AA6XKBm(xYMh#4Bg94P}VZz_;&q7XC++_fcy zAQeMS%7NpCzd>>H7Xt0Rd++TFd+V=uR?PS=&I}hS5ZoS}=wi-HgANpT(0}9-BR-pZ z5Y!~HPw&9ezJqxi6Jfo@r7w41=+fsM384C_N2Fg!g8StDe7xn?&_D!VD^QrB>TCI# zd+!Sd)x8FrbZG6M#?q^P6w_wVS8X6KBQ3XdKXAwkcBy+q&pdhe=qUTsBY6DIBCy=s zBD>MR-I;w~r+R81$pZzn%a++ab}OJ@ibQ~WU;|0Kcj?S1Q5M4}{&{%?KWnw(n{yw~ zZUd2vNbAXW1-cBcY@vVxJ49AyU5^_{FIPbU(f^2PVu4_t9y^yNEV}+q4=de_r0|$4 ziUWR_G_uj4;<^RnK`-%!TjUb&d>i}m=dQOZ*;!+u&%6RN+eT zN1>*KKBld_oXfjp=^ed94!-MSUFJL+$*7#C4?6-rR5Kb;IxWFF|8VdB`Q02b{(KM@>&pI z`OcU3e!R*Yc7()wzMYWmuSa5w_8+7QV^Q;Oh{f*1h1V3{6*vmgmP~SCM!6XpBZm-X zLGP~;X5HAQ*ty-9$WBF_oNlpGKfU0j(z8A+S6|lzc@~Q{z+}cse7jc~pPb;{o(2TY z?nR?T-yPSbJbV}RIxIcANJ$Bqk<$}{8L-I%Dr04@osP(Ivt{?6)YlMa7{WdcoFKnY 
z2uZHgm&@sD+q}8rV;WmcWI)ZtG%~+IKa8(>=`d_&9B4SDR}b|kDoZS{_l`HGwKs_i$+&j57ei-{j_glDiy@9II@ z0sVvR@H2pp2;d+%ao_Vtoai#NFoQ=c(rJQfTq!*xo7nXdln$#zsdEF*%JizK0r@A8 zZ1M7iy)ck*3{;it2yJrgYX>7hCup?$Me)X806vrKf2gBH05^8E ze%>iZdV~Rv{HRdqJyi74JHT8S_RpgmWJ(*Eq={;l%W$6r!*7LEK+1xUY#~7ur z9MYvzQXGuXVp{e10DjA|d1Ts!Eq`jwABA0dsfS!*+6;N?=Py@NLmI6TdgSiIaLn%c zWmt7Tul>FDx9Wc{F*I1&8FHTaqb#>#nSp$pQu*>fO3_WHrhY9bVQdfLj-rjq-fnu7 z=bm3>kq3VadB>{`K96=i$jRaVhs)|l)V7+-*)}p~;A%08$@k&aHl?BFdzZCgC9Xx( zfW_qaMICnT`;7HhF0|+LO+>Vnbr>vt>MzkzNwJ^+<)

D53R~VS(kV|#Em2{0B|L3XGZ{n6u*0r+tvM&LFkcEf);sfBq zm3^TdU}pvo2;0%m(S0ciC0(V}$;&9_W#!R|0=gX?CH!g49^eRng;bIxHC&u^Qx)X( zg4{6sjz~9v@SGFI^9}$#S&&TsPXqCo+v>@G=>4BVTzYf4M@jNr1_sER65D}excr35 zPWG32wBP-RF>%jHJQ_2NPCKe|81G~!=l(mfG7U%wBd2vmZQFsI<ZuPfA4`fIyi48&f z;%*j@4T;A&tZOKf=?p{(q(OZjudOPVtay!Y34Q4-6P6By$T6V9L!rGMDNk~& zQc0<~{zilXV6yqCS}X1C3^f)t=`9_!LoPf@^NX)3KcOIVrN~YbIaJhBRt>9ymv5^< z^|cS7me`1L$ofwJXpkS2p%9B3y|pq=!t??(p#`;WmUIic+V2xPHN|BOn8U1xQ4;!*&w9TEoA=(%Ha#c$;X@7sWA_bb=4 zHxB}d8syZ}jlX-PLI-^kc(=$>Q+6u}0=k!Rb+JDK8Q1ELq*M%ynLwAh>`XHy$wlGT zydBvhrZ#&X=2f5DeDqZDQ@2ciO`vbBPTsVPqHPhkMbCB3I2V+A54W6pm0*{% zp1UN)JWaiKS1LIlBv4Sj8?)vj%%E_-^F{-TSKEN$K->lKe!f%oC!Q!T=@Ja2$B9^* z!PZ}{`JU%iqkqBy0%)SB+NjboZdapQ`Fy(U9jWNX${yYj??=fMEbXfMdm8~_M_r_v z11t8ZpL~DG-tUsC`oYPUAzsm}9i zI6{8p{=^gk=opQ7G$>!2i2N3n_{*k+r>jVoOgNlDzI&$4cw~+W=Dr8So8kM1URDF4 zQlN(7cspZ#yw>Jo5(#lTgYDr$lcBuugW-E`nv_3vi{I%JOy)&TD)s;^rDIvv>U3u? z3d^!u5YiO1cz1cXJpLCj`K$QpyJf+P-n}K)U;grEAY$l-7|BdC<-Caz-IB`p7YfE% z5wLCCfanDh#B2ZY`B7VphY*qwsLysu2ut5o+~LhMYOlCyWS2nx*>krp`m}2sC9j%1 zCmwK3RBPS`Im1ABtBy=|M2S`}QS-pPc}wPv%^GWPs*}Tb`$bDQAr5M=w(LLCqQt3zS^)iSV&788 zLA6#>!XGd}hrGoS2Rycjf}1B`GCNUCC%WQ3hXEXPnKfa;O0P zkY8H6>w@~^x~y~koTH(qvA@{EKg=9307K0CzoVuzMU1W0X}v+@%M8R5r+okxKpUzF z>=n9(MG>+}O!4YkTB+2qUepb9D&Q#pl)>EIQnT_t0V4$05pw^7fz(YwxG*4-!lpbeW6 zHvAD$YWKK=ncP^_dzJ2|3wnfyEIblD^x>h@*p9q%y}mlb*(7X;%;hn_Z>oiSg`Ao+ zvYSMGWg6>^`1CfWiVZ8}cQvU3Hhz5q7EW-v9G7y*rUTm@-5bO9HvnhkM0reStS;2T zs5V`#9#|M>Nm*WMm)Aw-{G01O}YjmaU9xQTsuonsl@#_!(qT2 z{T;#w-d$b2SWd0?9pC8(#Y`GkgZvn6P3l~B5$e1HjfQezZ3rNhG)U|;RgwDzXVM@P zEBoZa+d6*xQn4@xCcB{8`l73FbIQya1uMKB#)E9oH>2J&pJ5umQHDVHST3`GgU6Fb zKnxSOGI0pz-2kJ!QlbELCCk?9W_aUgH>**h0CjY&*)-w**h2jQBfp^ip24=)Ezdi8 z&Z9p9h{1x{A40NUzzcS&J@ax{wlhWlJPyLw{O|`Q`S7W-4?~K&E@_ddvaLJAmgjrg z9?g(PPQ2EFGsGLbUm+tu@G9S?menZGte>}iKuAzoxf7>|qhum$c2F_qu!kAL;hdS@ zw>GAl0!qkB-|8A4e%gb)nMRZF1vlwr1vG?8k13%A5QIb3TM2L)&ulzt5dKl`#Y^+i zZITEnuH0_@*3;?d)`nYSENF9#_13*tVia9Sjn%RRJDc?7!o}?@bHWcWg3fDS&h4q& 
z+@)~%#Zyr6dD1-j+iQkoCn|PUU5V_F%g0m(+_;%LGktM06!!P!BEjocRK=9EI#y|LIRMcHPKMa5*j6tutLuQR$ z-vY^;PLvl+QtY9QX@Y_W_`1Bv-!j@cO&_hyMiW7DOYbnyqxah}|4q7t(^&Gn(0?E6 zzq188+g zbg=h7X2p(w8?bPC+S+14H{_`ZQRKx$JCv1o@)FiOXN$1t?{2#_(a9jr{4JkU=oBp~ zA(7Y$KL26{8$0)dUxK>S{x1>5+Z2P*?GN;Qn=iQxLdxLISnSb3#nbQcH^X8udS(C>u{Bzj< zxBn>A9CcId>vepG{^&j}xH_UWYoN!55GBR@YX%Skxa81K@e^wDo#ZCa4BH;+@XdMs z!M*(CyxlU+XVfssvSCDfqnH`!o__F?+0%VzfnQtr{vA#*7h8Re5zX`YD*~Z6ulb3X zsXKb8+%xa8_z2&pU%ze}_1Zg}YrHII!0y?9#?ofBZ8zsDe+o=yolXRO!z!&VSR8Qm zzn(+fOB5SzPM|1w^0?dAOz{vUTMU$$JpG5 zlyl;|^!(G7VL<0xOgt7Byf{-qJXK@+)u4sVp0=j*X)T;hvTAtDd&jKUeCb$&%O&jw z(5T+B2?UN#0|l4rd10MhX2b4KkN4GA*!$nMnQusNG~0{6R-2cX=Lr5@&S3MlM>>2x z#Xsd57q2~k5j4{EQ+}eaY$p8qTMy5Iw(+tpihC368s0-~$0v>zYbU(+veh%cZUd-w zgt}RZw9S_BUm$%|OmgqpL6nyc|ZNw07l$ zUh-EXmBQw1G}8e=FwOp6(A-}lv5(-ozC*pqvc0^jIcVo&Crv|dw@=qgPZlQ@L2)H6T*Q1eYlaIh`%zM_kE z?u38BtMyRZcFrrnLF9Hz#+Fu%cO5!J^yd9bjn`iHJ5}{a;xjb6SBUQU38LBN$lgLH z`*E=9#WRO5eg0;XXluI1as337-}0=BTcF(P1-sl1@mNf^m2-2og==YTP40P{v!1us zM_@Yw_%%Nnmtxp~rQkX!ADHQm7`Apxm0bFyKi`D;{EJaStq)}nnyGeG3Po7Ibg}{} zJTty$&+ZhNbm7-8wV5+tar3}O#*mV6?K^SZ=*bKMKvxSffD>7V~t$KLJRJx=dJ9CRnetjQhzZJWQDnP~cC zteGR9@#oLDDBm%$vUsBSRMCC*UI}4Gy4ABIti~Dr-ed>em@gKmCCtep2z~a{G2H94 z

## Background -The open-source package [scikit-learn](https://scikit-learn.org/) provides a large variety of machine +The [scikit-learn](https://scikit-learn.org/) package provides a large variety of machine learning algorithms and data processing tools, among which is the `Pipeline` class, allowing users to prepend custom data processing steps to the machine learning model. -`MolPipeline` extends this concept to the field of chemoinformatics by -wrapping default functionalities of [RDKit](https://www.rdkit.org/), such as reading and writing SMILES strings +`MolPipeline` extends this concept to the field of cheminformatics by +wrapping standard [RDKit](https://www.rdkit.org/) functionality, such as reading and writing SMILES strings or calculating molecular descriptors from a molecule-object. -A notable difference to the `Pipeline` class of scikit-learn is that the Pipline from `MolPipeline` allows for -instances to fail during processing without interrupting the whole pipeline. -Such behaviour is useful when processing large datasets, where some SMILES strings might not encode valid molecules -or some descriptors might not be calculable for certain molecules. +MolPipeline aims to provide: +- Automated end-to-end processing from molecule data sets to deployable machine learning models. +- Scalable parallel processing and low memory usage through instance-based processing. +- Standard pipeline building blocks for flexibly building custom pipelines for various +cheminformatics tasks. +- Consistent error handling for tracking, logging, and replacing failed instances (e.g., a +SMILES string that could not be parsed correctly). +- Integrated and self-contained pipeline serialization for easy deployment and tracking +in version control. ## Publications -The publication is freely available [here](https://chemrxiv.org/engage/chemrxiv/article-details/661fec7f418a5379b00ae036). 
+[Sieg J, Feldmann CW, Hemmerich J, Stork C, Sandfort F, Eiden P, and Mathea M, MolPipeline: A python package for processing +molecules with RDKit in scikit-learn, J. Chem. Inf. Model., doi:10.1021/acs.jcim.4c00863, 2024](https://doi.org/10.1021/acs.jcim.4c00863) +\ +Further links: [ChemRxiv](https://chemrxiv.org/engage/chemrxiv/article-details/661fec7f418a5379b00ae036) + +Feldmann CW, Sieg J, and Mathea M, Analysis of uncertainty of neural +fingerprint-based models, 2024 +\ +Further links: [repository](https://github.com/basf/neural-fingerprint-uncertainty) ## Installation ```commandline pip install molpipeline ``` -## Usage +## Documentation + +The [notebooks](notebooks) folder contains many basic and advanced examples of how to use MolPipeline. + +A nice introduction to the basic usage is in the [01_getting_started_with_molpipeline notebook](notebooks/01_getting_started_with_molpipeline.ipynb). -See the [notebooks](notebooks) folder for basic and advanced examples of how to use Molpipeline. +## Quick Start -A basic example of how to use MolPipeline to create a fingerprint-based model is shown below (see also the [notebook](notebooks/01_getting_started_with_molpipeline.ipynb)): +### Model building + +Create a fingerprint-based prediction model: ```python from molpipeline import Pipeline from molpipeline.any2mol import AutoToMol @@ -58,8 +79,42 @@ pipeline.predict(["CCC"]) # output: array([0.29]) ``` -Molpipeline also provides custom estimators for standard cheminformatics tasks that can be integrated into pipelines, -like clustering for scaffold splits (see also the [notebook](notebooks/02_scaffold_split_with_custom_estimators.ipynb)): +### Feature calculation + +Calculating molecular descriptors from SMILES strings is straightforward.
For example, physicochemical properties can +be calculated like this: +```python +from molpipeline import Pipeline +from molpipeline.any2mol import AutoToMol +from molpipeline.mol2any import MolToRDKitPhysChem + +pipeline_physchem = Pipeline( + [ + ("auto2mol", AutoToMol()), + ( + "physchem", + MolToRDKitPhysChem( + standardizer=None, + descriptor_list=["HeavyAtomMolWt", "TPSA", "NumHAcceptors"], + ), + ), + ], + n_jobs=-1, +) +physchem_matrix = pipeline_physchem.transform(["CCCCCC", "c1ccccc1(O)"]) +physchem_matrix +# output: array([[72.066, 0. , 0. ], +# [88.065, 20.23 , 1. ]]) +``` + +MolPipeline provides further features and descriptors from RDKit, +for example Morgan (binary/count) fingerprints and MACCS keys. +See the [04_feature_calculation notebook](notebooks/04_feature_calculation.ipynb) for more examples. + +### Clustering + +MolPipeline provides several clustering algorithms as sklearn-like estimators. For example, molecules can be +clustered by their Murcko scaffold. See the [02_scaffold_split_with_custom_estimators notebook](notebooks/02_scaffold_split_with_custom_estimators.ipynb) for scaffold splits and further examples. ```python from molpipeline.estimators import MurckoScaffoldClustering diff --git a/notebooks/04_feature_calculation.ipynb b/notebooks/04_feature_calculation.ipynb new file mode 100644 index 00000000..1bcf9ce8 --- /dev/null +++ b/notebooks/04_feature_calculation.ipynb @@ -0,0 +1,915 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5e18566-ab97-4ead-b6e3-0ad930754a21", + "metadata": {}, + "source": [ + "# Feature calculation\n", + "\n", + "\n", + "\n", + "MolPipeline provides multiple molecular featurization methods and descriptors from RDKit. This notebook shows how features like\n", + "\n", + "- Morgan binary fingerprints\n", + "- Morgan count fingerprints\n", + "- MACCS keys fingerprints\n", + "- Physicochemical features\n", + "\n", + "can be easily calculated in parallel and in different variations with MolPipeline.
If you are interested in further molecular featurization and descriptors check out the `molpipeline.mol2any` module." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6872cc5e-5851-42ec-a63e-071d8139829e", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from molpipeline import Pipeline\n", + "from molpipeline.any2mol import AutoToMol\n", + "from molpipeline.mol2any import MolToMorganFP, MolToMACCSFP, MolToRDKitPhysChem" + ] + }, + { + "cell_type": "markdown", + "id": "8a6ba6bf-c0cd-4949-82f3-e71e538cdee0", + "metadata": {}, + "source": [ + "In this example we fetch the ESOL (delaney) data set. However, you can use any other data set." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "761f0ee7-3e66-4e86-bdac-e9dcec9ecb17", + "metadata": {}, + "outputs": [], + "source": [ + "df_full = pd.read_csv(\n", + " \"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv\",\n", + " usecols=lambda col: col != \"num\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6853d13e-c371-49cc-8009-544022c67d34", + "metadata": {}, + "source": [ + "We use a smaller portion of the data set for illustration" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d47ea54e-ac15-4358-ae2b-7e8428642a26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Compound IDESOL predicted log solubility in mols per litreMinimum DegreeMolecular WeightNumber of H-Bond DonorsNumber of RingsNumber of Rotatable BondsPolar Surface Areameasured log solubility in mols per litresmiles
0Amigdalin-0.9741457.432737202.32-0.77OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1Fenfuram-2.8851201.22512242.24-3.30Cc1occc1C(=O)Nc2ccccc2
2citral-2.5791152.23700417.07-2.06CC(C)=CCCC(C)=CC(=O)
3Picene-6.6182278.3540500.00-7.87c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4Thiophene-2.232284.1430100.00-1.33c1ccsc1
.................................
95diethylstilbestrol-5.0741268.35622440.46-4.07CCC(=C(CC)c1ccc(O)cc1)c2ccc(O)cc2
96Chlorothalonil-3.9951265.91401047.58-5.64c1(C#N)c(Cl)c(C#N)c(Cl)c(Cl)c(Cl)1
972,3',4',5-PCB-6.3121291.9920210.00-7.25Clc1ccc(Cl)c(c1)c2ccc(Cl)c(Cl)c2
98styrene oxide-1.8262120.15102112.53-1.60C1OC1c2ccccc2
99Isopropylbenzene-3.2651120.1950110.00-3.27CC(C)c1ccccc1
\n", + "

100 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " Compound ID ESOL predicted log solubility in mols per litre \\\n", + "0 Amigdalin -0.974 \n", + "1 Fenfuram -2.885 \n", + "2 citral -2.579 \n", + "3 Picene -6.618 \n", + "4 Thiophene -2.232 \n", + ".. ... ... \n", + "95 diethylstilbestrol -5.074 \n", + "96 Chlorothalonil -3.995 \n", + "97 2,3',4',5-PCB -6.312 \n", + "98 styrene oxide -1.826 \n", + "99 Isopropylbenzene -3.265 \n", + "\n", + " Minimum Degree Molecular Weight Number of H-Bond Donors \\\n", + "0 1 457.432 7 \n", + "1 1 201.225 1 \n", + "2 1 152.237 0 \n", + "3 2 278.354 0 \n", + "4 2 84.143 0 \n", + ".. ... ... ... \n", + "95 1 268.356 2 \n", + "96 1 265.914 0 \n", + "97 1 291.992 0 \n", + "98 2 120.151 0 \n", + "99 1 120.195 0 \n", + "\n", + " Number of Rings Number of Rotatable Bonds Polar Surface Area \\\n", + "0 3 7 202.32 \n", + "1 2 2 42.24 \n", + "2 0 4 17.07 \n", + "3 5 0 0.00 \n", + "4 1 0 0.00 \n", + ".. ... ... ... \n", + "95 2 4 40.46 \n", + "96 1 0 47.58 \n", + "97 2 1 0.00 \n", + "98 2 1 12.53 \n", + "99 1 1 0.00 \n", + "\n", + " measured log solubility in mols per litre \\\n", + "0 -0.77 \n", + "1 -3.30 \n", + "2 -2.06 \n", + "3 -7.87 \n", + "4 -1.33 \n", + ".. ... \n", + "95 -4.07 \n", + "96 -5.64 \n", + "97 -7.25 \n", + "98 -1.60 \n", + "99 -3.27 \n", + "\n", + " smiles \n", + "0 OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)... \n", + "1 Cc1occc1C(=O)Nc2ccccc2 \n", + "2 CC(C)=CCCC(C)=CC(=O) \n", + "3 c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43 \n", + "4 c1ccsc1 \n", + ".. ... 
\n", + "95 CCC(=C(CC)c1ccc(O)cc1)c2ccc(O)cc2 \n", + "96 c1(C#N)c(Cl)c(C#N)c(Cl)c(Cl)c(Cl)1 \n", + "97 Clc1ccc(Cl)c(c1)c2ccc(Cl)c(Cl)c2 \n", + "98 C1OC1c2ccccc2 \n", + "99 CC(C)c1ccccc1 \n", + "\n", + "[100 rows x 10 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df_full.head(n=100)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "80d9843a-a702-4da5-8a4f-6c5ed7a5034b", + "metadata": {}, + "source": [ + "## Calculating fingerprints" + ] + }, + { + "cell_type": "markdown", + "id": "15dcb6cb-2a8e-4d62-a218-826581155816", + "metadata": {}, + "source": [ + "### Morgan binary fingerprints\n", + "\n", + "Morgan fingerprints are the most popular molecular fingerprints. They are also known as [Extended-Connectivity Fingerprints (ECFP)](https://doi.org/10.1021/ci100050t). They encode circular substructures in the molecule. The binary version contains only 0s and 1s indicating the presence or absence of the substructures in the molecule." + ] + }, + { + "cell_type": "markdown", + "id": "1a838dd7-ec21-4875-a5b8-c5e0c27d9389", + "metadata": {}, + "source": [ + "Let's define the Pipeline to first read the molecule and then calculate the binary Morgan fingerprint. Then, we execute it by calling the `transform` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b6be019a-cc4d-45b2-b41a-9dca98d9644c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 181 ms, sys: 247 ms, total: 428 ms\n", + "Wall time: 12.6 s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "# define the pipeline\n", + "pipeline_morgan = Pipeline(\n", + " [(\"auto2mol\", AutoToMol()), (\"morgan2_2048\", MolToMorganFP(n_bits=2048, radius=2))],\n", + " n_jobs=-1,\n", + ")\n", + "# execute the pipeline\n", + "morgan_matrix = pipeline_morgan.transform(df[\"smiles\"])\n", + "morgan_matrix" + ] + }, + { + "cell_type": "markdown", + "id": "a13cc430-1c5e-4399-ab50-4b56ce8a7c09", + "metadata": {}, + "source": [ + "By default, the `MolToMorganFP` element returns a sparse matrix. More specifically, a [csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) is returned which is more memory efficient than a dense matrix since most elements in the matrix are zero." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d872a591-cbfe-4158-8960-da813249fd1b", + "metadata": {}, + "source": [ + "To get a dense matrix you can convert the `csr_matrix` to a dense numpy matrix like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5d9d772b-98b9-42e5-ba12-11f007a3d17f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "matrix([[0, 1, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 1, 0, ..., 0, 0, 0]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "morgan_matrix.todense()" + ] + }, + { + "cell_type": "markdown", + "id": "923f168d-e6e4-418d-adb3-5451555b1303", + "metadata": {}, + "source": [ + "Alternatively, you can specify in the `MolToMorganFP` element the return type of the feature matrix by using the `return_as` option. You can choose between\n", + "\n", + "- `return_as=\"sparse\"` which returns a `csr_matrix`\n", + "- `return_as=\"dense\"` which returns a dense numpy matrix\n", + "- `return_as=\"explicit_bit_vect\"` which returns RDKit's dense [ExplicitBitVect](https://www.rdkit.org/new_docs/cppapi/classExplicitBitVect.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e728cf48-10bb-4168-9229-fe48b462ac03", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 45.4 ms, sys: 11.7 ms, total: 57 ms\n", + "Wall time: 62.4 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0, 1, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", +
"pipeline_morgan_dense = Pipeline(\n", + " [\n", + " (\"auto2mol\", AutoToMol()),\n", + " (\"morgan2_2048\", MolToMorganFP(n_bits=2048, radius=2, return_as=\"dense\")),\n", + " ],\n", + " n_jobs=-1,\n", + ")\n", + "dense_morgan_matrix = pipeline_morgan_dense.transform(df[\"smiles\"])\n", + "dense_morgan_matrix" + ] + }, + { + "cell_type": "markdown", + "id": "6aecd789-2198-4325-b892-6aeecf857e25", + "metadata": {}, + "source": [ + "The feature matrix can be used to train a machine learning model but also for various analyses." + ] + }, + { + "cell_type": "markdown", + "id": "85043b30-7476-4204-8268-a9375b2ee4f8", + "metadata": {}, + "source": [ + "### Morgan count fingerprints" + ] + }, + { + "cell_type": "markdown", + "id": "9897e96f-4ffd-434b-b629-837a31a99f04", + "metadata": {}, + "source": [ + "Just set `counted=True` to compute Morgan count fingerprints instead of binary fingerprints." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "477ebba4-0fbe-46c2-8c4a-13f9051ae85b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 1, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 1, 0, ..., 0, 0, 0]], dtype=uint32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline_morgan_counted = Pipeline(\n", + " [\n", + " (\"auto2mol\", AutoToMol()),\n", + " (\n", + " \"morgan2_2048\",\n", + " MolToMorganFP(n_bits=2048, radius=2, counted=True, return_as=\"dense\"),\n", + " ),\n", + " ],\n", + " n_jobs=-1,\n", + ")\n", + "count_morgan_matrix = pipeline_morgan_counted.transform(df[\"smiles\"])\n", + "count_morgan_matrix" + ] + }, + { + "cell_type": "markdown", + "id": "0e24ea56-f0f8-4426-b3e3-da960b93d431", + "metadata": {}, + "source": [ + "When we sort the matrix values we see that some substructures are present up to 14 times in a single 
molecule." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "189ea2d6-9274-4097-b654-5ca88c318abf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(count_morgan_matrix.ravel(), reverse=True)[:20]" + ] + }, + { + "cell_type": "markdown", + "id": "80fb055a-1b4c-4c69-989c-5f3e774e80e1", + "metadata": {}, + "source": [ + "### MACCS key fingerprints\n", + "\n", + "MACCS keys are a manually defined set of 166 substructures whose presence is checked in the molecule. MACCS keys contain for example common functional groups." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d9a11c62-c8ad-470f-b40f-f5d4ddc16b61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 43.8 ms, sys: 1.15 ms, total: 44.9 ms\n", + "Wall time: 70.9 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[0, 0, 0, ..., 1, 1, 0],\n", + " [0, 0, 0, ..., 1, 1, 0],\n", + " [0, 0, 0, ..., 1, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 0, 1, 0],\n", + " [0, 0, 0, ..., 1, 1, 0],\n", + " [0, 0, 0, ..., 0, 1, 0]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pipeline_maccs_dense = Pipeline(\n", + " [(\"auto2mol\", AutoToMol()), (\"maccs\", MolToMACCSFP(return_as=\"dense\"))],\n", + " n_jobs=-1,\n", + ")\n", + "dense_maccs_matrix = pipeline_maccs_dense.transform(df[\"smiles\"])\n", + "dense_maccs_matrix" + ] + }, + { + "cell_type": "markdown", + "id": "7d3546ca-6d58-4a69-a252-d7deb3147a40", + "metadata": {}, + "source": [ + "## Physicochemical features\n", + "\n", + "RDKit also provides more than 200 physicochemical descriptors that can readily be computed from most molecules. 
In MolPipeline we can compute these features with the `MolToRDKitPhysChem` element." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "858afb55-7e24-415d-bb5a-e0d7c811d6df", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 68.1 ms, sys: 2.43 ms, total: 70.5 ms\n", + "Wall time: 171 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[10.25332888, 10.25332888, 0.48660209, ..., 0. ,\n", + " 0. , 0. ],\n", + " [11.72491119, 11.72491119, 0.14587963, ..., 0. ,\n", + " 0. , 0. ],\n", + " [10.02049761, 10.02049761, 0.84508976, ..., 0. ,\n", + " 0. , 0. ],\n", + " ...,\n", + " [ 6.08815823, 6.08815823, 0.49556374, ..., 0. ,\n", + " 0. , 0. ],\n", + " [ 5.09453704, 5.09453704, 0.40851852, ..., 0. ,\n", + " 0. , 0. ],\n", + " [ 2.2037037 , 2.2037037 , 0.65851852, ..., 0. ,\n", + " 0. , 0. ]])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pipeline_physchem = Pipeline(\n", + " [(\"auto2mol\", AutoToMol()), (\"physchem\", MolToRDKitPhysChem(standardizer=None))],\n", + " n_jobs=-1,\n", + ")\n", + "physchem_matrix = pipeline_physchem.transform(df[\"smiles\"])\n", + "physchem_matrix" + ] + }, + { + "cell_type": "markdown", + "id": "8746f6cb-dc30-4435-a97b-0235f2c8c47a", + "metadata": {}, + "source": [ + "We can get the name of the descriptors like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f0b5fe47-54f0-4cca-9a1a-aa689a0b2d0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['MaxAbsEStateIndex',\n", + " 'MaxEStateIndex',\n", + " 'MinAbsEStateIndex',\n", + " 'MinEStateIndex',\n", + " 'qed',\n", + " 'SPS',\n", + " 'HeavyAtomMolWt',\n", + " 'ExactMolWt',\n", + " 'NumValenceElectrons',\n", + " 'NumRadicalElectrons',\n", + " 'MaxPartialCharge',\n", + " 'MinPartialCharge',\n", + " 'MaxAbsPartialCharge',\n", + " 'MinAbsPartialCharge',\n", + " 
'FpDensityMorgan1',\n", + " 'FpDensityMorgan2',\n", + " 'FpDensityMorgan3',\n", + " 'BCUT2D_MWHI',\n", + " 'BCUT2D_MWLOW',\n", + " 'BCUT2D_CHGHI']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline_physchem[\"physchem\"].descriptor_list[:20]" + ] + }, + { + "cell_type": "markdown", + "id": "b0823f4d-8a2e-4ae2-91f7-3db6ecaf0c0e", + "metadata": {}, + "source": [ + "When we only want to calculate a subset of all available descriptors we can specify this during pipeline construction" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a3e005f3-f421-4634-9135-860e91a19de1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 41.2 ms, sys: 3.38 ms, total: 44.6 ms\n", + "Wall time: 47.5 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[430.216, 202.32 , 12. ],\n", + " [190.137, 42.24 , 2. ],\n", + " [136.109, 17.07 , 1. ],\n", + " [264.242, 0. , 0. ],\n", + " [ 80.111, 0. , 1. ],\n", + " [130.151, 12.89 , 2. ],\n", + " [321.397, 0. , 0. ],\n", + " [248.196, 40.46 , 2. ],\n", + " [372.849, 12.53 , 1. ],\n", + " [372.247, 63.22 , 6. ],\n", + " [ 78.05 , 29.1 , 1. ],\n", + " [155.563, 0. , 0. ],\n", + " [ 60.055, 0. , 0. ],\n", + " [204.144, 58.2 , 2. ],\n", + " [168.154, 0. , 0. ],\n", + " [ 71.486, 0. , 0. ],\n", + " [ 76.054, 20.23 , 1. ],\n", + " [ 98.084, 23.79 , 1. ],\n", + " [283.184, 53.47 , 6. ],\n", + " [148.12 , 20.23 , 1. ],\n", + " [321.397, 0. , 0. ],\n", + " [216.155, 54.86 , 3. ],\n", + " [243.25 , 18.46 , 5. ],\n", + " [166.115, 38.33 , 2. ],\n", + " [309.139, 115.54 , 6. ],\n", + " [100.076, 20.23 , 1. ],\n", + " [172.103, 72.68 , 5. ],\n", + " [196.121, 75.27 , 3. ],\n", + " [309.966, 0. , 0. ],\n", + " [140.097, 26.3 , 2. ],\n", + " [120.11 , 0. , 0. ],\n", + " [267.272, 18.46 , 5. ],\n", + " [284.186, 76.66 , 4. ],\n", + " [ 94.928, 0. , 0. ],\n", + " [168.154, 0. , 0. 
],\n", + " [ 76.054, 17.07 , 1. ],\n", + " [158.139, 12.03 , 1. ],\n", + " [234.215, 29.54 , 3. ],\n", + " [325.266, 38.77 , 5. ],\n", + " [210.981, 0. , 0. ],\n", + " [179.585, 0. , 0. ],\n", + " [ 76.054, 20.23 , 1. ],\n", + " [160.088, 75.27 , 3. ],\n", + " [136.109, 20.23 , 1. ],\n", + " [ 80.042, 26.3 , 2. ],\n", + " [100.076, 20.23 , 1. ],\n", + " [205.998, 29.1 , 1. ],\n", + " [258.034, 60.91 , 4. ],\n", + " [328.195, 107.77 , 7. ],\n", + " [146.128, 12.89 , 1. ],\n", + " [ 96.088, 0. , 0. ],\n", + " [220.143, 75.27 , 3. ],\n", + " [216.198, 0. , 0. ],\n", + " [248.015, 54.86 , 3. ],\n", + " [356.85 , 0. , 0. ],\n", + " [100.076, 20.23 , 1. ],\n", + " [108.099, 0. , 0. ],\n", + " [144.132, 0. , 0. ],\n", + " [228.209, 0. , 0. ],\n", + " [ 76.054, 17.07 , 1. ],\n", + " [427.756, 0. , 0. ],\n", + " [104.064, 26.3 , 2. ],\n", + " [367.223, 115.06 , 6. ],\n", + " [102.072, 46.25 , 2. ],\n", + " [248.157, 90.06 , 5. ],\n", + " [347.692, 54.37 , 3. ],\n", + " [213.587, 53.94 , 5. ],\n", + " [118.075, 68.87 , 3. ],\n", + " [223.993, 72.19 , 2. ],\n", + " [215.038, 0. , 0. ],\n", + " [232.111, 118.05 , 6. ],\n", + " [277.042, 52.37 , 3. ],\n", + " [136.109, 17.07 , 1. ],\n", + " [232.154, 75.27 , 3. ],\n", + " [116.075, 26.3 , 2. ],\n", + " [116.075, 26.3 , 2. ],\n", + " [356.252, 75.71 , 4. ],\n", + " [250.491, 0. , 0. ],\n", + " [115.937, 0. , 0. ],\n", + " [325.09 , 49.17 , 5. ],\n", + " [245.177, 55.84 , 6. ],\n", + " [140.105, 51.56 , 4. ],\n", + " [ 72.092, 52.04 , 1. ],\n", + " [ 96.088, 0. , 0. ],\n", + " [120.11 , 0. , 0. ],\n", + " [236.74 , 0. , 0. ],\n", + " [428.285, 68.55 , 5. ],\n", + " [ 82.038, 43.14 , 2. ],\n", + " [136.109, 17.07 , 1. ],\n", + " [261.627, 45.23 , 3. ],\n", + " [188.977, 43.14 , 2. ],\n", + " [236.211, 58.2 , 3. ],\n", + " [192.176, 0. , 0. ],\n", + " [ 88.065, 9.23 , 1. ],\n", + " [144.132, 0. , 0. ],\n", + " [248.196, 40.46 , 2. ],\n", + " [265.914, 47.58 , 2. ],\n", + " [285.944, 0. , 0. ],\n", + " [112.087, 12.53 , 1. 
],\n", + " [108.099, 0. , 0. ]])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "pipeline_physchem_small = Pipeline(\n", + " [\n", + " (\"auto2mol\", AutoToMol()),\n", + " (\n", + " \"physchem\",\n", + " MolToRDKitPhysChem(\n", + " standardizer=None,\n", + " descriptor_list=[\"HeavyAtomMolWt\", \"TPSA\", \"NumHAcceptors\"],\n", + " ),\n", + " ),\n", + " ],\n", + " n_jobs=-1,\n", + ")\n", + "physchem_matrix_small = pipeline_physchem_small.transform(df[\"smiles\"])\n", + "physchem_matrix_small" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ccb5d59aef8b39f2b79d15ef8dddbc5e266639ad Mon Sep 17 00:00:00 2001 From: Jochen Sieg Date: Fri, 20 Sep 2024 14:59:59 +0200 Subject: [PATCH 6/6] add counted fps to test --- .../test_mol2morgan_fingerprint.py | 26 ++++++++++++------- tests/utils/fingerprints.py | 4 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index 14ff2282..6fae46b4 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -10,6 +10,7 @@ from molpipeline import Pipeline from molpipeline.any2mol import SmilesToMol from molpipeline.mol2any import MolToMorganFP +from tests.utils.fingerprints import fingerprints_to_numpy test_smiles = [ "c1ccccc1", @@ -129,11 +130,15 @@ def test_setter_getter_error_handling(self) -> None: self.assertRaises(ValueError, 
mol_fp.set_params, **params) def test_bit2atom_mapping(self) -> None: - """Test that the mapping from bits to atom weights works as intended.""" - # lower n_bit values, e.g. 2048, will lead to a bit clash during folding, - # for the test smiles "NCCOCCCC(=O)O". - # We want no folding clashes in this test to check the correct length - # of the bit-to-atom mapping. + """Test that the mapping from bits to atom weights works as intended. + + Notes + ----- + lower n_bit values, e.g. 2048, will lead to a bit clash during folding, + for the test smiles "NCCOCCCC(=O)O". + We want no folding clashes in this test to check the correct length + of the bit-to-atom mapping. + """ n_bits = 2100 sparse_morgan = MolToMorganFP(radius=2, n_bits=n_bits, return_as="sparse") dense_morgan = MolToMorganFP(radius=2, n_bits=n_bits, return_as="dense") @@ -144,10 +149,13 @@ def test_bit2atom_mapping(self) -> None: smi2mol = SmilesToMol() for test_smi in test_smiles: for fp_gen in [sparse_morgan, dense_morgan, explicit_bit_vect_morgan]: - mol = smi2mol.transform([test_smi])[0] - fp = fp_gen.transform([mol]) - mapping = fp_gen.bit2atom_mapping(mol) - self.assertEqual(np.sum(fp), len(mapping)) # type: ignore + for counted in [False, True]: + mol = smi2mol.transform([test_smi])[0] + fp_gen.set_params(counted=counted) + fp = fp_gen.transform([mol]) + mapping = fp_gen.bit2atom_mapping(mol) + np_fp = fingerprints_to_numpy(fp) + self.assertEqual(np.nonzero(np_fp)[0].shape[0], len(mapping)) # type: ignore if __name__ == "__main__": diff --git a/tests/utils/fingerprints.py b/tests/utils/fingerprints.py index 5973d004..1ca392a4 100644 --- a/tests/utils/fingerprints.py +++ b/tests/utils/fingerprints.py @@ -8,7 +8,7 @@ # pylint: disable=no-name-in-module from rdkit.Chem import rdFingerprintGenerator as rdkit_fp -from rdkit.DataStructs import ExplicitBitVect +from rdkit.DataStructs import ExplicitBitVect, UIntSparseIntVect from scipy import sparse @@ -59,6 +59,8 @@ def fingerprints_to_numpy( """ if 
all(isinstance(fp, ExplicitBitVect) for fp in fingerprints): return np.array(fingerprints) + if all(isinstance(fp, UIntSparseIntVect) for fp in fingerprints): + return np.array([fp.ToList() for fp in fingerprints]) if isinstance(fingerprints, sparse.csr_matrix): return fingerprints.toarray() if isinstance(fingerprints, np.ndarray):