zama-ai · RomanBredehoft · Apr 10, 2024 · Apr 3, 2024 · Apr 4, 2024 · Apr 9, 2024
@@ -13,6 +13,7 @@
 from concrete.fhe.compilation import Circuit, Configuration
 from concrete.fhe.mlir.utils import MAXIMUM_TLU_BIT_WIDTH
 from sklearn.datasets import make_classification, make_regression
+from sklearn.metrics import accuracy_score
 
 from concrete.ml.common.utils import (
     SUPPORTED_FLOAT_TYPES,
@@ -420,7 +421,7 @@ def check_accuracy():
     """Fixture to check the accuracy."""
 
     def check_accuracy_impl(expected, actual, threshold=0.9):
-        accuracy = numpy.mean(expected == actual)
+        accuracy = accuracy_score(expected, actual)
         assert accuracy >= threshold, f"Accuracy of {accuracy} is not high enough ({threshold})."
 
     return check_accuracy_impl

@@ -694,6 +694,8 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
         Returns:
             numpy.ndarray: The post-processed predictions.
         """
+        assert isinstance(y_preds, numpy.ndarray), "Output predictions must be an array."
+
         return y_preds
 
 
@@ -805,8 +807,10 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
 
             # If the prediction array is 1D, transform the output into a 2D array [1-p, p],
             # with p the initial output probabilities
+            # This is similar to what is done in scikit-learn
             if y_preds.ndim == 1 or y_preds.shape[1] == 1:
-                y_preds = numpy.concatenate((1 - y_preds, y_preds), axis=1)
+                y_preds = y_preds.flatten()
+                return numpy.vstack([1 - y_preds, y_preds]).T
 
         # Else, apply the softmax operator
         else:
@@ -1387,8 +1391,13 @@ def quantize_input(self, X: numpy.ndarray) -> numpy.ndarray:
     def dequantize_output(self, q_y_preds: numpy.ndarray) -> numpy.ndarray:
         self.check_model_is_fitted()
 
-        q_y_preds = self.output_quantizers[0].dequant(q_y_preds)
-        return q_y_preds
+        y_preds = self.output_quantizers[0].dequant(q_y_preds)
+
+        # If the preds have shape (n, 1), squeeze it to shape (n,) like in scikit-learn
+        if y_preds.ndim == 2 and y_preds.shape[1] == 1:
+            return y_preds.ravel()
+
+        return y_preds
 
     def _get_module_to_compile(self) -> Union[Compiler, QuantizedModule]:
         assert self._tree_inference is not None, self._is_not_fitted_error_message()
@@ -1442,7 +1451,12 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
         if not self._fhe_ensembling:
             y_preds = numpy.sum(y_preds, axis=-1)
 
-            assert_true(y_preds.ndim == 2, "y_preds should be a 2D array")
+            assert isinstance(y_preds, numpy.ndarray), "Output predictions must be an array."
+
+            # If the preds have shape (n, 1), squeeze it to shape (n,) like in scikit-learn
+            if y_preds.ndim == 2 and y_preds.shape[1] == 1:
+                return y_preds.ravel()
+
             return y_preds
 
         return super().post_processing(y_preds)
@@ -1693,6 +1707,10 @@ def dequantize_output(self, q_y_preds: numpy.ndarray) -> numpy.ndarray:
         # De-quantize the output values
         y_preds = self.output_quantizers[0].dequant(q_y_preds)
 
+        # If the preds have shape (n, 1), squeeze it to shape (n,) like in scikit-learn
+        if y_preds.ndim == 2 and y_preds.shape[1] == 1:
+            return y_preds.ravel()
+
         return y_preds
 
     def _get_module_to_compile(self) -> Union[Compiler, QuantizedModule]:
@@ -1735,38 +1753,6 @@ class SklearnLinearRegressorMixin(SklearnLinearModelMixin, sklearn.base.Regresso
     """
 
 
-class SklearnSGDRegressorMixin(SklearnLinearRegressorMixin):
-    """A Mixin class for sklearn SGD regressors with FHE.
-
-    This class is used to create a SGD regressor class what can be exported
-    to ONNX using Hummingbird.
-    """
-
-    # Remove once Hummingbird supports SGDRegressor
-    # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4100
-    def _set_onnx_model(self, test_input: numpy.ndarray) -> None:
-        """Retrieve the model's ONNX graph using Hummingbird conversion.
-
-        Args:
-            test_input (numpy.ndarray): An input data used to trace the model execution.
-        """
-        # Check that the underlying sklearn model has been set and fit
-        assert self.sklearn_model is not None, self._sklearn_model_is_not_fitted_error_message()
-
-        model_for_onnx = LinearRegression()
-        model_for_onnx.coef_ = self.sklearn_model.coef_
-        model_for_onnx.intercept_ = self.sklearn_model.intercept_
-
-        self.onnx_model_ = hb_convert(
-            model_for_onnx,
-            backend="onnx",
-            test_input=test_input,
-            extra_config={"onnx_target_opset": OPSET_VERSION_FOR_ONNX_EXPORT},
-        ).model
-
-        self._clean_graph()
-
-
 class SklearnLinearClassifierMixin(
     BaseClassifier, SklearnLinearModelMixin, sklearn.base.ClassifierMixin, ABC
 ):
@@ -1807,14 +1793,61 @@ def decision_function(
         """
         # Here, we want to use SklearnLinearModelMixin's `predict` method as confidence scores are
         # the dot product's output values, without any post-processing
-        y_preds = SklearnLinearModelMixin.predict(self, X, fhe=fhe)
-        return y_preds
+        y_scores = SklearnLinearModelMixin.predict(self, X, fhe=fhe)
+
+        return y_scores
 
     def predict_proba(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray:
-        y_logits = self.decision_function(X, fhe=fhe)
-        y_proba = self.post_processing(y_logits)
+        y_scores = self.decision_function(X, fhe=fhe)
+        y_proba = self.post_processing(y_scores)
         return y_proba
 
+    # In scikit-learn, the argmax is done on the scores directly, not the probabilities
+    def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray:
+        # Compute the predicted scores
+        y_scores = self.decision_function(X, fhe=fhe)
+
+        # Retrieve the class with the highest score
+        # If there is a single dimension, only compare the scores to 0
+        if y_scores.ndim == 1:
+            y_preds = (y_scores > 0).astype(int)
+        else:
+            y_preds = numpy.argmax(y_scores, axis=1)
+
+        return self.classes_[y_preds]
+
+
+class SklearnSGDRegressorMixin(SklearnLinearRegressorMixin):
+    """A Mixin class for sklearn SGD regressors with FHE.
+
+    This class is used to create a SGD regressor class what can be exported
+    to ONNX using Hummingbird.
+    """
+
+    # Remove once Hummingbird supports SGDRegressor
+    # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4100
+    def _set_onnx_model(self, test_input: numpy.ndarray) -> None:
+        """Retrieve the model's ONNX graph using Hummingbird conversion.
+
+        Args:
+            test_input (numpy.ndarray): An input data used to trace the model execution.
+        """
+        # Check that the underlying sklearn model has been set and fit
+        assert self.sklearn_model is not None, self._sklearn_model_is_not_fitted_error_message()
+
+        model_for_onnx = LinearRegression()
+        model_for_onnx.coef_ = self.sklearn_model.coef_
+        model_for_onnx.intercept_ = self.sklearn_model.intercept_
+
+        self.onnx_model_ = hb_convert(
+            model_for_onnx,
+            backend="onnx",
+            test_input=test_input,
+            extra_config={"onnx_target_opset": OPSET_VERSION_FOR_ONNX_EXPORT},
+        ).model
+
+        self._clean_graph()
+
 
 class SklearnSGDClassifierMixin(SklearnLinearClassifierMixin):
     """A Mixin class for sklearn SGD classifiers with FHE.
@@ -1823,7 +1856,7 @@ class SklearnSGDClassifierMixin(SklearnLinearClassifierMixin):
     to ONNX using Hummingbird.
     """
 
-    # Remove once Hummingbird supports SGDRegressor
+    # Remove once Hummingbird supports SGDClassifier
     # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4100
     def _set_onnx_model(self, test_input: numpy.ndarray) -> None:
         """Retrieve the model's ONNX graph using Hummingbird conversion.
@@ -1988,6 +2021,11 @@ def dequantize_output(self, q_y_preds: numpy.ndarray) -> numpy.ndarray:
         self.check_model_is_fitted()
         # We compute the sorted argmax in FHE, which are integers.
         # No need to de-quantize the output values
+
+        # If the preds have shape (n, 1), squeeze it to shape (n,) like in scikit-learn
+        if q_y_preds.ndim > 1 and q_y_preds.shape[1] == 1:
+            return q_y_preds.ravel()
+
         return q_y_preds
 
     def _get_module_to_compile(self) -> Union[Compiler, QuantizedModule]:

@@ -9,7 +9,6 @@
 from sklearn.linear_model import SGDClassifier as SklearnSGDClassifier
 from sklearn.preprocessing import LabelEncoder
 
-from ..common.check_inputs import check_array_and_assert
 from ..common.utils import FheMode
 from ..onnx.ops_impl import numpy_sigmoid
 from ..quantization import QuantizedModule
@@ -253,27 +252,12 @@ def __init__(
                     "Setting 'parameter_range' is mandatory if FHE training is enabled "
                     f"({fit_encrypted=}). Got {parameters_range=}"
                 )
-
-    def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
-        # If the prediction array is 1D, which happens with some models such as XGBCLassifier or
-        # LogisticRegression models, we have a binary classification problem
-        n_classes = y_preds.shape[1] if y_preds.ndim > 1 and y_preds.shape[1] > 1 else 2
-
-        # For binary classification problem, apply the sigmoid operator
-        if n_classes == 2:
-            y_preds = numpy_sigmoid(y_preds)[0]
-
-            # If the prediction array is 1D, transform the output into a 2D array [1-p, p],
-            # with p the initial output probabilities
-            if y_preds.ndim == 1 or y_preds.shape[1] == 1:
-                y_preds = numpy.concatenate((1 - y_preds, y_preds), axis=1)
-
-        # Else, apply the softmax operator
         else:
-            y_preds = numpy_sigmoid(y_preds)[0]
-            y_preds = y_preds / y_preds.sum(axis=1)
-
-        return y_preds
+            supported_losses = ["log_loss", "modified_huber"]
+            if self.loss not in supported_losses:
+                raise NotImplementedError(
+                    f"Only one of {supported_losses} loss is supported. Got {self.loss}."
+                )
 
     def get_sklearn_params(self, deep: bool = True) -> dict:
         # Here, the `get_params` method is the `BaseEstimator.get_params` method from scikit-learn
@@ -835,61 +819,24 @@ def partial_fit(
             # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4184
             raise NotImplementedError("Partial fit is not currently supported for clear training.")
 
-    # This method is taken directly from scikit-learn
-    def _predict_proba_lr(self, X: Data, fhe: Union[FheMode, str]) -> numpy.ndarray:
-        """Probability estimation for OvR logistic regression.
-
-        Positive class probabilities are computed as
-        1. / (1. + np.exp(-self.decision_function(X)));
-        multiclass is handled by normalizing that over all classes.
-
-        Args:
-            X (Data): The input values to predict, as a Numpy array, Torch tensor, Pandas DataFrame
-                or List. It mush have a shape of (n_samples, n_features).
-            fhe (Union[FheMode, str]): The mode to use for prediction.
-                Can be FheMode.DISABLE for Concrete ML Python inference,
-                FheMode.SIMULATE for FHE simulation and FheMode.EXECUTE for actual FHE execution.
-                Can also be the string representation of any of these values.
-
-        Returns:
-            numpy.ndarray: The predicted class probabilities.
-        """
-        prob = self.decision_function(X, fhe=fhe)
-        prob = numpy_sigmoid(prob)[0]
-
-        assert isinstance(prob, numpy.ndarray)
-
-        if prob.shape[1] == 1:
-            prob = prob.flatten()
-            return numpy.vstack([1 - prob, prob]).T
-
-        # OvR normalization, like LibLinear's predict_probability
-        prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
-
-        return prob
-
-    def predict_proba(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray:
-        """Probability estimates.
+    def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
+        """Apply post-processing to the de-quantized predictions.
 
-        This method is only available for log loss and modified Huber loss.
-        Multiclass probability estimates are derived from binary (one-vs.-rest)
-        estimates by simple normalization, as recommended by Zadrozny and Elkan.
+        This is called at the end of the `predict_proba` method and is only available for log loss
+        and modified Huber losses. Multiclass probability estimates are derived from binary
+        (one-vs.-rest) estimates by simple normalization, as recommended by Zadrozny and Elkan.
 
         Binary probability estimates for loss="modified_huber" are given by
         (clip(decision_function(X), -1, 1) + 1) / 2. For other loss functions
         it is necessary to perform proper probability calibration by wrapping
         the classifier with `sklearn.calibration.CalibratedClassifierCV` instead.
 
         Args:
-            X (Data): The input values to predict, as a Numpy array, Torch tensor, Pandas DataFrame
-                or List. It mush have a shape of (n_samples, n_features).
-            fhe (Union[FheMode, str]): The mode to use for prediction.
-                Can be FheMode.DISABLE for Concrete ML Python inference,
-                FheMode.SIMULATE for FHE simulation and FheMode.EXECUTE for actual FHE execution.
-                Can also be the string representation of any of these values.
+            y_preds (Data): The de-quantized predictions to post-process. It mush have a shape of
+                (n_samples, n_features).
 
         Returns:
-            numpy.ndarray: The predicted class probabilities, with shape (n_samples, n_classes).
+            numpy.ndarray: The post-processed predictions, with shape (n_samples, n_classes).
 
         Raises:
             NotImplementedError: If the given loss is not supported.
@@ -903,52 +850,61 @@ def predict_proba(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) ->
             case is in the appendix B in:
             http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf
         """
-        X = check_array_and_assert(X)
-
+        # The following lines are taken directly from scikit-learn's source code
         if self.loss == "log_loss":
-            return self._predict_proba_lr(X, fhe=fhe)
+            y_preds = numpy_sigmoid(y_preds)[0]
+
+            assert isinstance(y_preds, numpy.ndarray)
+
+            if y_preds.ndim == 1 or y_preds.shape[1] == 1:
+                y_preds = y_preds.flatten()
+                return numpy.vstack([1 - y_preds, y_preds]).T
 
-        if self.loss == "modified_huber":
+            # OvR normalization, like LibLinear's predict_probability
+            prob = y_preds / y_preds.sum(axis=1).reshape((y_preds.shape[0], -1))
+
+        # The following lines are taken directly from scikit-learn's source code
+        elif self.loss == "modified_huber":
             assert isinstance(self.classes_, numpy.ndarray)
             binary = len(self.classes_) == 2
-            scores = self.decision_function(X)
-
-            if binary:
-                scores = scores[:, 0]
 
             prob2 = numpy.empty(tuple())
             if binary:
-                prob2 = numpy.ones((scores.shape[0], 2))
+                prob2 = numpy.ones((y_preds.shape[0], 2))
                 prob = prob2[:, 1]
+
             else:
-                prob = scores
+                prob = y_preds
 
-            numpy.clip(scores, -1, 1, prob)
+            numpy.clip(y_preds, -1, 1, prob)
             prob += 1.0
             prob /= 2.0
 
             if binary:
                 prob2[:, 0] -= prob
                 prob = prob2
+
             else:
                 # the above might assign zero to all classes, which doesn't
                 # normalize neatly; work around this to produce uniform
                 # probabilities
                 prob_sum = prob.sum(axis=1)
                 all_zero = prob_sum == 0
+
                 if numpy.any(all_zero):  # pragma: no cover
                     prob[all_zero, :] = 1
                     prob_sum[all_zero] = len(self.classes_)
 
                 # normalize
                 prob /= prob_sum.reshape((prob.shape[0], -1))
 
-            return prob
+        else:  # pragma: no cover
+            supported_losses = ["log_loss", "modified_huber"]
+            raise NotImplementedError(
+                f"Only one of {supported_losses} loss is supported. Got {self.loss}."
+            )
 
-        raise NotImplementedError(
-            f"Method 'predict_proba' currently only supports one of"
-            f" ['log_loss', 'modifier_huber'] loss. Got {self.loss}."
-        )
+        return prob
 
     def dump_dict(self) -> Dict[str, Any]:
         assert self._weight_quantizer is not None, self._is_not_fitted_error_message()