72 make chemprop multiclass classification model #73

Merged · 25 commits merged on Sep 5, 2024
Changes from 12 commits
Commits (25)
737db7f
add multi class classifier
JenniferHem Aug 26, 2024
8692bc2
use input check to prevent confusing message by torch of the class la…
JenniferHem Aug 26, 2024
b560d18
remove get_params
JenniferHem Aug 26, 2024
b3d0af8
make n classes non-optional
JenniferHem Aug 26, 2024
fe40e4a
black
JenniferHem Aug 26, 2024
70e5928
ignore lightning logs
JenniferHem Aug 27, 2024
3de60fd
add test for multiclass
JenniferHem Aug 27, 2024
3003493
mock data for test
JenniferHem Aug 27, 2024
1b84120
remove random write csv
JenniferHem Aug 27, 2024
3dbcff8
add test for full coverage of multiclass chemprop
JenniferHem Aug 28, 2024
dd0ebbe
add missing parameters for docsig
JenniferHem Aug 28, 2024
d744d4d
code review requests
JenniferHem Aug 28, 2024
e404579
Adapt error message
JenniferHem Aug 28, 2024
2b2d687
check classifier in init
JenniferHem Aug 28, 2024
f87d68b
docstring adaptations
JenniferHem Aug 28, 2024
7faedc1
fix docstrings and naming in tests
JenniferHem Aug 28, 2024
4834614
split instance check from validation
JenniferHem Aug 30, 2024
261d7db
add test for set_params and initialize Multiclass FFN properly
JenniferHem Sep 3, 2024
4e84111
raise attribute error if wrong model.predictor is passed
JenniferHem Sep 3, 2024
c5810fc
test multiclass setter and getter
JenniferHem Sep 3, 2024
a064100
pass correct tasks
JenniferHem Sep 3, 2024
33e1202
black
JenniferHem Sep 3, 2024
4a117d5
docsig and pydocstyle
JenniferHem Sep 3, 2024
fdb1d31
lint: docstrings and tests
JenniferHem Sep 3, 2024
eef3d22
missing space
JenniferHem Sep 4, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -3,4 +3,5 @@ __pycache__
molpipeline.egg-info/
lib/
build/
lightning_logs/

117 changes: 116 additions & 1 deletion molpipeline/estimators/chemprop/models.py
@@ -112,7 +112,7 @@ def _is_multiclass_classifier(self) -> bool:
bool
True if the model is a multiclass classifier, False otherwise.
"""
if isinstance(self.model.predictor, MulticlassClassificationFFN):
if isinstance(self.model.predictor, MulticlassClassificationFFN) and self.n_classes > 2:
return True
return False
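For context, a minimal sketch of what the sharpened check above expresses, assuming the `chemprop.nn` predictor classes used elsewhere in this file: the predictor type alone is not decisive; the configured class count must also exceed two.

```python
from chemprop.nn import BinaryClassificationFFN, MulticlassClassificationFFN

# Sketch of the two conditions combined above: the predictor must be a
# MulticlassClassificationFFN *and* the configured class count must exceed two.
predictor = MulticlassClassificationFFN(n_classes=3)
n_classes = 3

print(isinstance(predictor, MulticlassClassificationFFN) and n_classes > 2)  # True

# A binary predictor is not an instance of MulticlassClassificationFFN, so the check fails.
print(isinstance(BinaryClassificationFFN(), MulticlassClassificationFFN))  # False
```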

@@ -348,3 +348,118 @@ def __init__(
n_jobs=n_jobs,
**kwargs,
)


class ChempropMulticlassClassifier(ChempropModel):
"""Chemprop model with default parameters for multiclass classification tasks."""

def __init__(
self,
n_classes: int,
model: MPNN | None = None,
lightning_trainer: pl.Trainer | None = None,
batch_size: int = 64,
n_jobs: int = 1,
**kwargs: Any,
) -> None:
"""Initialize the chemprop multiclass model.

Parameters
----------
n_classes : int
The number of classes for the classifier.
model : MPNN | None, optional
The chemprop model to wrap. If None, a default model will be used.
lightning_trainer : pl.Trainer | None, optional
The lightning trainer to use. If None, a default trainer will be used.
batch_size : int, optional (default=64)
The batch size to use.
n_jobs : int, optional (default=1)
The number of jobs to use.
kwargs : Any
Parameters set using `set_params`.
Can be used to modify components of the model.
"""
if model is None:
bond_encoder = BondMessagePassing()
agg = SumAggregation()
predictor = MulticlassClassificationFFN(n_classes=n_classes)
model = MPNN(message_passing=bond_encoder, agg=agg, predictor=predictor)
super().__init__(
model=model,
lightning_trainer=lightning_trainer,
batch_size=batch_size,
n_jobs=n_jobs,
**kwargs,
)
self.n_classes = n_classes

def set_params(self, **params: Any) -> Self:
"""Set the parameters of the model and check if it is a multiclass classifier.

Parameters
----------
**params
The parameters to set.

Returns
-------
Self
The model with the new parameters.
"""
super().set_params(**params)
if not self._is_multiclass_classifier():
raise ValueError(
"ChempropMulticlassClassifier should contain more than 2 classes."
)
return self

def fit(
self,
X: MoleculeDataset,
y: Sequence[int | float] | npt.NDArray[np.int_ | np.float64],
) -> Self:
"""Fit the model to the data.

Parameters
----------
X : MoleculeDataset
The input data.
y : Sequence[int | float] | npt.NDArray[np.int_ | np.float64]
The target data.

Returns
-------
Self
The fitted model.
"""
self._check_correct_input(y)
return super().fit(X, y)

def _check_correct_input(
self, y: Sequence[int | float] | npt.NDArray[np.int_ | np.float64]
) -> None:
"""Check if the input for the multi-class classifier is correct.

Parameters
----------
y : Sequence[int | float] | npt.NDArray[np.int_ | np.float64]
Intended classes for the dataset.

Raises
------
ValueError
If the number of unique classes in y does not match n_classes, or if the class labels are not the consecutive integers 0 to n_classes - 1.
"""
unique_y = np.unique(y)
log = []
if self.n_classes != len(unique_y):
log.append(
f"Given number of classes in init (n_classes) does not match the number of unique classes (found {unique_y}) in the target data."
)
if sorted(unique_y) != list(range(self.n_classes)):
err = f"Classes need to be in the range from 0 to {self.n_classes-1}. Found {unique_y}. Please correct the input data accordingly."
print(err)
JenniferHem marked this conversation as resolved.
Show resolved Hide resolved
log.append(err)
if log:
raise ValueError("\n".join(log))
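Taken together, a hedged usage sketch of the new class (exercising the private validation helper directly for brevity; the import path follows the file shown above, and the default model is the BondMessagePassing + SumAggregation + MulticlassClassificationFFN combination built in `__init__`):

```python
import numpy as np

from molpipeline.estimators.chemprop.models import ChempropMulticlassClassifier

# Default model with a three-class predictor.
clf = ChempropMulticlassClassifier(n_classes=3)

# Labels 0..2 with every class present pass the input check silently.
clf._check_correct_input(np.array([0, 1, 2, 0, 1, 2]))

# Shifted labels (1..3) violate the "0 to n_classes - 1" requirement and raise before training.
try:
    clf._check_correct_input(np.array([1, 2, 3, 1, 2, 3]))
except ValueError as err:
    print(err)
```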
83 changes: 82 additions & 1 deletion test_extras/test_chemprop/test_chemprop_pipeline.py
Expand Up @@ -23,6 +23,7 @@
ChempropClassifier,
ChempropModel,
ChempropRegressor,
ChempropMulticlassClassifier,
)
from molpipeline.mol2any.mol2chemprop import MolToChemprop
from molpipeline.pipeline import Pipeline
@@ -139,6 +140,40 @@ def get_classification_pipeline() -> Pipeline:
return model_pipeline


def get_multiclass_classification_pipeline(n_classes: int) -> Pipeline:
"""Get the Chemprop model pipeline for multiclass classification.

Parameters
----------
n_classes : int
The number of classes for model initialization.

Returns
-------
Pipeline
The Chemprop model pipeline for multiclass classification.
"""
smiles2mol = SmilesToMol()
mol2chemprop = MolToChemprop()
error_filter = ErrorFilter(filter_everything=True)
filter_reinserter = FilterReinserter.from_error_filter(
error_filter, fill_value=np.nan
)
chemprop_model = ChempropMulticlassClassifier(
n_classes=n_classes, lightning_trainer=DEFAULT_TRAINER
)
model_pipeline = Pipeline(
steps=[
("smiles2mol", smiles2mol),
("mol2chemprop", mol2chemprop),
("error_filter", error_filter),
("model", chemprop_model),
("filter_reinserter", PostPredictionWrapper(filter_reinserter)),
],
)
return model_pipeline


_T = TypeVar("_T")
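A sketch of how the new helper is exercised, mirroring the test added further down; `DEFAULT_TRAINER` and the other imports are the ones already defined in this test module, so this is an illustration rather than a standalone script.

```python
import numpy as np

# Tiny illustrative dataset: SMILES strings with labels 0, 1 and 2.
smiles = ["CCCCCC", "CCCCCCCO", "CCCN", "CCCO", "CCCCCN", "CCCC"]
labels = np.array([0, 1, 2, 1, 2, 0])

pipeline = get_multiclass_classification_pipeline(n_classes=3)
pipeline.fit(smiles, labels)

proba = pipeline.predict_proba(smiles)
print(proba.shape)  # (6, 3): one row per molecule, one column per class
```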


@@ -282,7 +317,6 @@ def test_prediction(self) -> None:
molecule_net_bbbp_df = pd.read_csv(
TEST_DATA_DIR / "molecule_net_bbbp.tsv.gz", sep="\t", nrows=100
)
molecule_net_bbbp_df.to_csv("molecule_net_bbbp.tsv.gz", sep="\t", index=False)
classification_model = get_classification_pipeline()
classification_model.fit(
molecule_net_bbbp_df["smiles"].tolist(),
@@ -306,3 +340,50 @@

self.assertEqual(proba.shape, proba_copy.shape)
self.assertTrue(np.allclose(proba[~nan_indices], proba_copy[~nan_indices]))


class TestMulticlassClassificationPipeline(unittest.TestCase):
"""Test the Chemprop model pipeline for classification."""
JenniferHem marked this conversation as resolved.
Show resolved Hide resolved

def test_prediction(self) -> None:
"""Test the prediction of the classification model."""
JenniferHem marked this conversation as resolved.
Show resolved Hide resolved

test_data_df = pd.read_csv(
TEST_DATA_DIR / "multiclass_mock.tsv", sep="\t", index_col=False
)
print(test_data_df.head())
print(test_data_df.columns)
classification_model = get_multiclass_classification_pipeline(n_classes=3)
mols = test_data_df["Molecule"].tolist()
classification_model.fit(
mols,
test_data_df["Label"].to_numpy(),
)
pred = classification_model.predict(mols)
proba = classification_model.predict_proba(mols)
self.assertEqual(len(pred), len(test_data_df))
self.assertEqual(proba.shape[1], 3)
self.assertEqual(proba.shape[0], len(test_data_df))

model_copy = joblib_dump_load(classification_model)
pred_copy = model_copy.predict(mols)
proba_copy = model_copy.predict_proba(mols)

nan_indices = np.isnan(pred)
self.assertListEqual(nan_indices.tolist(), np.isnan(pred_copy).tolist())
self.assertTrue(np.allclose(pred[~nan_indices], pred_copy[~nan_indices]))

self.assertEqual(proba.shape, proba_copy.shape)
self.assertTrue(np.allclose(proba[~nan_indices], proba_copy[~nan_indices]))

with self.assertRaises(ValueError):
classification_model.fit(
mols,
test_data_df["Label"].add(1).to_numpy(),
)
with self.assertRaises(ValueError):
classification_model = get_multiclass_classification_pipeline(n_classes=2)
classification_model.fit(
mols,
test_data_df["Label"].to_numpy(),
)
13 changes: 13 additions & 0 deletions tests/test_data/multiclass_mock.tsv
@@ -0,0 +1,13 @@
Molecule Label
"CCCCCC" 0
"CCCCCCCO" 1
"CCCC" 0
"CCCN" 2
"CCCCCC" 0
"CCCO" 1
"CCCCC" 0
"CCCCCN" 2
"CC(C)CCC" 0
"CCCCCCO" 1
"CCCCCl" 0
"CCC#N" 2