diff --git a/docs/advanced_examples/KNearestNeighbors.ipynb b/docs/advanced_examples/KNearestNeighbors.ipynb index 081a2496c..fa6d88837 100644 --- a/docs/advanced_examples/KNearestNeighbors.ipynb +++ b/docs/advanced_examples/KNearestNeighbors.ipynb @@ -12,7 +12,7 @@ "\n", "In classification, KNN aims to identify the nearest points by measuring their similarity, often through distance metrics. The new labels are then assigned through majority voting, considering the most frequent labels among the neighboring points.\n", "\n", - "In Fully Homomorphic Encryption (FHE), classification with KNN poses significant computational challenges due to the distance calculations and the sorting algorithms, which is currently a non-stable algorithm.\n", + "In Fully Homomorphic Encryption (FHE), classification with KNN poses significant computational challenges due to the distance calculations and the sorting algorithm, which is currently non-stable (i.e., it does not preserve the relative order of equal elements).\n", "\n", "It is therefore recommended to use it on small datasets (up to dozens of examples) with strong quantization (n_bits <= 4).\n", "\n", @@ -25,7 +25,7 @@ "source": [ "### Import libraries\n", "\n", - "First, import the required packages, the classical KNN regressor and its Concrete ML counterpart." + "First, import the required packages." ] }, { @@ -38,6 +38,7 @@ "\n", "import pandas as pd\n", "from sklearn.datasets import make_classification\n", + "from sklearn.metrics import accuracy_score\n", "from sklearn.model_selection import train_test_split\n", "\n", "from concrete.ml.sklearn import KNeighborsClassifier as ConcreteKNeighborsClassifier" ] }, { @@ -57,7 +58,7 @@ "outputs": [], "source": [ "X, y = make_classification(\n", - " n_samples=20, n_features=20, n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=1\n", + " n_samples=20, n_features=5, n_informative=3, n_redundant=0, n_classes=2, n_clusters_per_class=1\n", ")\n", "# Split the data-set into a train and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -67,7 +68,11 @@ - "# Model instantiation" + "# Model instantiation\n", + "\n", + "The novel aspects introduced by Concrete ML models are the following hyperparameters:\n", + "- `n_bits`: the precision used for quantizing input data. 
This quantization step is essential after the training phase, since FHE exclusively operates over integers.\n", + "- `rounding_threshold_bits`: the number of most significant bits to keep when intermediate values are rounded, which reduces the bit-width of FHE operations and speeds up execution."
   ]
  },
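+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a rough illustration of the quantization step (a plain-`numpy` sketch, not the Concrete ML API, which performs this internally), mapping floats onto an `n_bits` integer grid looks as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "values = np.array([-1.3, 0.2, 0.7, 1.5])\n",
+    "n_bits = 2\n",
+    "# Uniform quantization: map the float range onto integers in [0, 2**n_bits - 1]\n",
+    "scale = (values.max() - values.min()) / (2**n_bits - 1)\n",
+    "q_values = np.round((values - values.min()) / scale).astype(np.int64)\n",
+    "print(q_values)  # [0 2 2 3]"
+   ]
+  },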
  {
@@ -76,16 +81,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# The novel aspect introduced by Concret-ML models is the hyperparameter `n_bits`, which represents\n",
-    "# the precision for quantizing input data\n",
-    "# This quantization step is essential after the training phase, since FHE exclusively operates\n",
-    "# over integers\n",
-    "\n",
     "n_neighbors = 3\n",
     "\n",
-    "concrete_knn = ConcreteKNeighborsClassifier(n_bits=3, n_neighbors=n_neighbors)\n",
+    "concrete_knn = ConcreteKNeighborsClassifier(\n",
+    "    n_bits=2, n_neighbors=n_neighbors, rounding_threshold_bits=4\n",
+    ")\n",
     "\n",
-    "# Fit both the Concrete ML and its equivalent float estimators on clear data\n",
+    "# Fit both the Concrete ML and its equivalent float estimator on clear data\n",
     "concrete_knn, sklearn_model = concrete_knn.fit_benchmark(X_train, y_train)"
   ]
  },
@@ -93,7 +95,15 @@
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Compile the model"
+    "# Compile the model\n",
+    "\n",
+    "The compilation step aims to:\n",
+    "- convert the quantized model to its FHE equivalent\n",
+    "- create an executable operation graph\n",
+    "- check the operation graph's compatibility with FHE\n",
+    "- compute the maximum bit-width needed for model execution\n",
+    "- determine cryptographic parameters necessary for generating secret keys and evaluation keys"
   ]
  },
  {
@@ -105,18 +115,11 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Saved in 47\n",
-      "Compilation time: 2.56 seconds\n"
+      "Compilation time: 4.22 seconds\n"
     ]
    }
   ],
   "source": [
-    "# The compilation step aims to:\n",
-    "# - convert the quantized model to its FHE equivalent\n",
-    "# - create an executable operation graph\n",
-    "# - check the operation graph's compatibility with FHE\n",
-    "# - compute the maximum bit-width needed for model execution\n",
-    "# - determine cryptographic parameters necessary for generating secret keys and evaluation keys\n",
     "time_begin = time.time()\n",
     "circuit = concrete_knn.compile(X)\n",
     "print(f\"Compilation time: {time.time() - time_begin:.2f} seconds\")"
   ]
  },
  {
@@ -131,19 +134,24 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Generating a key for an 8-bit circuit\n"
+      "Generating a key for a 6-bit circuit\n"
     ]
    }
   ],
   "source": [
-    "print(f\"Generating a key for an {circuit.graph.maximum_integer_bit_width()}-bit circuit\")"
+    "print(f\"Generating a key for a {circuit.graph.maximum_integer_bit_width()}-bit circuit\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Key generation"
+    "# Key generation\n",
+    "\n",
+    "The circuit generated by the compiler is used to generate a set of keys:\n",
+    "\n",
+    "- a _Secret key_, held exclusively by the user and used for both the encryption and decryption processes\n",
+    "- an _Evaluation key_, publicly accessible without compromising the security of the scheme, and used to evaluate the circuit on encrypted data"
   ]
  },
  {
@@ -155,14 +163,14 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Key generation time: 829.81 seconds\n"
+      "Key generation time: 40.04 seconds\n"
     ]
    }
   ],
   "source": [
     "# Note that this step may be time-consuming for circuits exceeding 8-bits\n",
     "time_begin = time.time()\n",
-    "circuit.client.keygen(force=False)\n",
+    "circuit.client.keygen()\n",
     "print(f\"Key generation time: {time.time() - time_begin:.2f} seconds\")"
   ]
  },
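+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To build intuition for the `rounding_threshold_bits` parameter set above, the sketch below mimics in plain Python the kind of rounding applied to intermediate values during compilation (an illustration of the idea only, not the actual `round_bit_pattern` API):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def round_lsbs(value, lsbs_to_remove):\n",
+    "    \"\"\"Round an integer to the nearest multiple of 2**lsbs_to_remove.\"\"\"\n",
+    "    half = 1 << (lsbs_to_remove - 1)\n",
+    "    return ((value + half) >> lsbs_to_remove) << lsbs_to_remove\n",
+    "\n",
+    "\n",
+    "# Dropping the 2 least significant bits: fewer distinct values, smaller table lookups\n",
+    "print([round_lsbs(v, 2) for v in [13, 14, 17, 21]])  # [12, 16, 16, 20]"
+   ]
+  },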
  {
@@ -172,11 +180,11 @@
   "source": [
    "# Inference with Concrete ML:\n",
    "\n",
-    "a. __clear__: inference on unencrypted quantized data, without any FHE execution \n",
+    "a. __Clear__: inference on non-encrypted quantized data, without any FHE execution\n",
    "\n",
-    "b. __Execution in FHE__: inference on encrypted data, using actual FHE execution\n",
+    "b. __Simulation__: inference on non-encrypted quantized data, while simulating all FHE operations, failure probabilities and crypto-parameters. This mode of inference is recommended during the development phase. For further information, please consult [this link](https://docs.zama.ai/concrete-ml/advanced-topics/compilation#fhe-simulation)\n",
    "\n",
-    "c. __Simulation__: inference on unencrypted quantized data, without secure FHE execution, while simulating the p_error failure probability. For further information, please consult: [TODO]()"
+    "c. __Execution in FHE__: inference on encrypted data, using actual FHE execution"
   ]
  },
  {
@@ -187,36 +195,22 @@
   "source": [
    "# scikit-learn inference\n",
    "predict_sklearn = sklearn_model.predict(X_test)\n",
-    "score_sklearn = (predict_sklearn == y_test).mean()"
+    "score_sklearn = accuracy_score(y_test, predict_sklearn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Time inference: 40.05 seconds per sample\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# a- Clear inference\n",
    "pred_cml_clear = concrete_knn.predict(X_test, fhe=\"disable\")\n",
-    "score_cml_clear = (pred_cml_clear == y_test).mean()\n",
+    "score_cml_clear = accuracy_score(y_test, pred_cml_clear)\n",
    "\n",
-    "# b- FHE inference\n",
-    "time_begin = time.time()\n",
-    "pred_cml_fhe = concrete_knn.predict(X_test[0, None], fhe=\"execute\")\n",
-    "print(f\"Time inference: {time.time() - time_begin:.2f} seconds per sample\")\n",
-    "score_cml_fhe = (pred_cml_fhe == y_test[0]).mean()\n",
-    "\n",
-    "# c- FHE simulation inference\n",
+    "# b- FHE simulation inference\n",
    "pred_cml_simulate = concrete_knn.predict(X_test, fhe=\"simulate\")\n",
-    "score_cml_simulate = (pred_cml_simulate == y_test).mean()"
+    "score_cml_simulate = accuracy_score(y_test, pred_cml_simulate)"
   ]
  },
  {
@@ -230,155 +224,164 @@
    "text": [
     "scikit-learn score: 70.00%\n",
     "Concrete ML (clear) score: 80.00%\n",
-     "Concrete ML FHE (simulation) score: 80.00%\n",
-     "Concrete ML FHE score: 100.00%\n"
+     "Concrete ML (FHE simulation) score: 80.00%\n"
    ]
   }
  ],
  "source": [
   "print(f\"scikit-learn score: {score_sklearn:.2%}\")\n",
   "print(f\"Concrete ML (clear) score: {score_cml_clear:.2%}\")\n",
-   "print(f\"Concrete ML FHE (simulation) score: {score_cml_simulate:.2%}\")\n",
-   "print(f\"Concrete ML FHE score: {score_cml_fhe:.2%}\")"
+   "print(f\"Concrete ML (FHE simulation) score: {score_cml_simulate:.2%}\")"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-   "# Concrete KNN vs. scikit-learn KNN"
+   "### Concrete KNN vs. scikit-learn KNN\n",
+   "\n",
+   "Let's compare the top-k labels returned by Concrete ML's and scikit-learn's KNN in the table below, highlighting mismatched predictions."
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 18,
+  "execution_count": 11,
  "metadata": {},
  "outputs": [],
  "source": [
   "distance, topk_args = sklearn_model.kneighbors(X_test)\n",
   "\n",
   "topk_sk = y_train[topk_args]\n",
-   "topk_cml = concrete_knn.topk"
+   "topk_cml = concrete_knn._topk_labels"
  ]
 },
 {
  "cell_type": "code",
-  "execution_count": 29,
+  "execution_count": 12,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/html": [
      "\n",
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 DistanceTop3 (scikit-learn)Majority vote (scikit-learn)Top3 (Concrete ML)Majority vote (Concrete ML)DistanceTop3 (scikit-learn)Majority vote (scikit-learn)Top3 (Concrete ML)Majority vote (Concrete ML)
04.620949[0, 0, 0]0[0, 0, 0]002.041796[1, 0, 0]0[1, 0, 0]0
15.215420[1, 0, 1]1[1, 0, 1]112.514646[0, 0, 0]0[0, 0, 0]0
23.655300[0, 0, 0]0[0, 0, 0]022.037168[1, 1, 0]1[1, 1, 1]1
35.601465[1, 0, 0]0[1, 0, 1]131.800107[1, 1, 1]1[1, 1, 1]1
44.655596[1, 1, 0]1[1, 1, 0]141.380009[1, 0, 1]1[0, 0, 1]0
53.393518[0, 1, 0]0[0, 1, 0]051.078951[0, 0, 1]0[0, 0, 1]0
65.437388[1, 1, 1]1[1, 1, 0]161.093890[1, 1, 0]1[1, 1, 1]1
74.737523[1, 1, 0]1[0, 1, 1]172.419766[1, 1, 1]1[1, 1, 1]1
85.163767[1, 1, 0]1[1, 1, 1]182.809947[0, 0, 0]0[0, 0, 0]0
94.256865[1, 1, 0]1[1, 1, 0]191.553386[1, 1, 1]1[1, 1, 0]1
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 29, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "def highlight_diff(row):\n", + " \"\"\"Custom style function to highlight distinct rows.\"\"\"\n", + " return [\n", + " \"background-color: yellow\"\n", + " if row[\"Majority vote (Concrete ML)\"] != row[\"Majority vote (scikit-learn)\"]\n", + " else \"\"\n", + " ] * len(row)\n", + "\n", + "\n", "df = pd.DataFrame(\n", " {\n", " \"Distance\": distance[:, 0],\n", @@ -389,14 +392,6 @@ " }\n", ")\n", "\n", - "# Custom style function to highlight distinct rows\n", - "def highlight_diff(row):\n", - " return [\n", - " \"background-color: yellow\"\n", - " if row[\"Majority vote (Concrete ML)\"] != row[\"Majority vote (scikit-learn)\"]\n", - " else \"\"\n", - " ] * len(row)\n", - "\n", "df.style.apply(highlight_diff, axis=1)" ] }, @@ -404,26 +399,83 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Conclusion\n", - "\n", - "Due to its notable time and memory complexity, the K-nearest neighbors classifier is currently not well-suited for real-world applications, especially those involving large data-sets.\n", + "The difference in the top-k labels presented in the table above can be linked to two factors: the quantization of distances and the fact that the sorting algorithm is not stableof the sorting algorithm." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inference in FHE\n", "\n", - "Performance-wise, the K-nearest neighbors classifier matches pretty good its scikit-learn equivalent. The loss in accuracy is caused by the quantization and to the fact that the sorting algorithm is non-stable." + "Now, that the model is compiled, prediction in real FHE can be carried out." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FHE inference execution time: 4.64 seconds per sample\n" + ] + } + ], + "source": [ + "# c- FHE inference\n", + "time_begin = time.time()\n", + "pred_cml_fhe = concrete_knn.predict(X_test, fhe=\"execute\")\n", + "print(\n", + " f\"FHE inference execution time:{(time.time() - time_begin) / len(X_test):.2f}s per sample\"\n", + ")\n", + "score_cml_fhe = accuracy_score(y_test, pred_cml_fhe)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Concrete ML FHE score: 80.00%\n" + ] + } + ], + "source": [ + "print(f\"Concrete ML FHE score: {score_cml_fhe:.2%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predictions in FHE can be time-consuming due to the costly FHE operations. During the development phase, using the simulation mode offers an efficient and quick way to gauge the model's performance. Especially as it produces similar results." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Future work\n", + "# Conclusion\n", "\n", - "In future releases of **Concrete ML** we will improve the K-nearest neighbors classifier to consider more features and larger data-sets." + "For real-world applications, especially those involving with large data-sets, the K-nearest neighbors classifier may face limitations due to its notable time and memory complexity. TODO: add rounding_threshold_bits trick\n", + "\n", + "The K-nearest neighbors classifier matches its scikit-learn equivalent. 
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### Inference in FHE\n",
+   "\n",
+   "Now that the model is compiled, predictions can be carried out in real FHE."
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 13,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "FHE inference execution time: 4.64 seconds per sample\n"
+    ]
+   }
+  ],
+  "source": [
+   "# c- FHE inference\n",
+   "time_begin = time.time()\n",
+   "pred_cml_fhe = concrete_knn.predict(X_test, fhe=\"execute\")\n",
+   "print(\n",
+   "    f\"FHE inference execution time: {(time.time() - time_begin) / len(X_test):.2f} seconds per sample\"\n",
+   ")\n",
+   "score_cml_fhe = accuracy_score(y_test, pred_cml_fhe)"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 14,
+  "metadata": {},
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Concrete ML FHE score: 80.00%\n"
+    ]
+   }
+  ],
+  "source": [
+   "print(f\"Concrete ML FHE score: {score_cml_fhe:.2%}\")"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "Predictions in FHE can be time-consuming due to the costly FHE operations. During the development phase, using the simulation mode offers an efficient and quick way to gauge the model's performance, especially as it produces similar results."
+  ]
+ },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-   "# Future work\n",
+   "# Conclusion\n",
   "\n",
-   "In future releases of **Concrete ML** we will improve the K-nearest neighbors classifier to consider more features and larger data-sets."
+   "For real-world applications, especially those involving large data-sets, the K-nearest neighbors classifier may face limitations due to its notable time and memory complexity. The `rounding_threshold_bits` trick shown above helps reduce this cost by lowering the bit-width of the FHE operations.\n",
+   "\n",
+   "The K-nearest neighbors classifier matches the accuracy of its scikit-learn equivalent. The small loss in accuracy comes from quantization artifacts as well as the fact that the sorting algorithm is non-stable."
  ]
 }
],
"metadata": {
 "execution": {
  "timeout": 10800
+ },
+ "language_info": {
+  "name": "python"
 }
},
"nbformat": 4,
diff --git a/src/concrete/ml/sklearn/base.py b/src/concrete/ml/sklearn/base.py
index 2a8dbe8af..c4d390825 100644
--- a/src/concrete/ml/sklearn/base.py
+++ b/src/concrete/ml/sklearn/base.py
@@ -21,8 +21,6 @@
 import skorch.net
 import torch
 from brevitas.export.onnx.qonnx.manager import QONNXManager as BrevitasONNXManager
-from concrete.fhe import array as fhe_array
-from concrete.fhe import zeros as fhe_zeros
 from concrete.fhe.compilation.artifacts import DebugArtifacts
 from concrete.fhe.compilation.circuit import Circuit
 from concrete.fhe.compilation.compiler import Compiler
@@ -347,6 +345,9 @@ def get_sklearn_params(self, deep: bool = True) -> dict:
         # Remove the n_bits parameters as this attribute is added by Concrete ML
         params.pop("n_bits", None)
 
+        # Remove the rounding_threshold_bits parameter as this attribute is added by Concrete ML
+        params.pop("rounding_threshold_bits", None)
+
         return params
 
     def _set_post_processing_params(self) -> None:
@@ -1761,12 +1762,13 @@ def __init_subclass__(cls):
             _NEIGHBORS_MODELS.add(cls)
             _ALL_SKLEARN_MODELS.add(cls)
 
-    def __init__(self, n_bits: int = 3):
+    def __init__(self, n_bits: int = 3, rounding_threshold_bits: int = 6):
         """Initialize the FHE knn model.
 
         Args:
-            n_bits (int): Number of bits to quantize the model. IThe value will be used for
+            n_bits (int): Number of bits to quantize the model. The value will be used for
                 quantizing inputs and X_fit. Default to 3.
+            rounding_threshold_bits (int): Number of most significant bits to keep when rounding accumulators. Defaults to 6.
         """
         self.n_bits: int = n_bits
         # _q_fit_X: In distance metric algorithms, `_q_fit_X` stores the training set to compute
@@ -1777,6 +1779,11 @@
         self._y: numpy.ndarray
         # _q_fit_X_quantizer: The quantizer to use for quantizing the model's training set
         self._q_fit_X_quantizer: Optional[UniformQuantizer] = None
+        # _topk_labels: The labels of the k nearest neighbors, stored during post-processing
+        self._topk_labels: numpy.ndarray
+        # Number of most significant bits to keep when rounding intermediate values
+        self.rounding_threshold_bits = rounding_threshold_bits
+        self.rounder = cnp.AutoRounder(target_msbs=rounding_threshold_bits)
 
         BaseEstimator.__init__(self)
 
@@ -1966,7 +1973,7 @@ def gather1d(x, indices):
             arr = []
             for i in indices:
                 arr.append(x[i])
-            enc_arr = fhe_array(arr)
+            enc_arr = cnp.array(arr)
             return enc_arr
 
         def scatter1d(x, v, indices):
@@ -1987,7 +1994,7 @@
             return x
 
         comparisons = numpy.zeros(x.shape)
-        labels = labels + fhe_zeros(labels.shape)
+        labels = labels + cnp.zeros(labels.shape)
 
         n, k = x.size, self.n_neighbors
         ln2n = int(numpy.ceil(numpy.log2(n)))
@@ -2025,8 +2032,8 @@
                 with cnp.tag("max"):
                     # Select max(a, b)
-                    diff = a - b
-                    max_x = a + numpy.maximum(0, b - a)
+                    diff = b - a
+                    max_x = a + numpy.maximum(0, diff)
 
                 # Swap if a > b
                 # x[range_i] = max_x(a, b): First bitonic sequence gets min(a, b)
@@ -2034,9 +2041,9 @@
                 # x[range_i + d] = min(a, b): Second bitonic sequence gets max(a, b)
                 x = scatter1d(x, max_x, range_i + d)
 
-                with cnp.tag("sign"):
-                    # Max index selection
-                    is_a_greater_than_b = diff <= 0
+                with cnp.tag("Rounded_sign"):
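+                    # Rounding the difference before taking its sign keeps only its
+                    # `rounding_threshold_bits` most significant bits, which shrinks the
+                    # table lookup that evaluates the comparison in FHE, at the cost of
+                    # occasionally misordering nearly-equal distances.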
+                    diff = cnp.round_bit_pattern(diff, lsbs_to_remove=self.rounder)
+                    is_a_greater_than_b = diff >= 0
 
                 # Update labels array according to the max items
                 with cnp.tag("label_swap"):
@@ -2047,14 +2054,20 @@ def scatter1d(x, v, indices):
                     labels = scatter1d(labels, max_labels, range_i + d)
 
                 # Update
-                comparisons[range_i + d] = comparisons[range_i + d] + 1
-                d = q - p
-                r = p
+                with cnp.tag("update_indices"):
+                    comparisons[range_i + d] = comparisons[range_i + d] + 1
+                    d = q - p
+                    r = p
 
         return labels[0 : self.n_neighbors]
 
        # 1. Pairwise Euclidean distance
-       distance_matrix = pairwise_euclidean_distance(q_X)
+       with cnp.tag("Original distance"):
+           distance_matrix = pairwise_euclidean_distance(q_X)
+
+       # Reduce the bit-width precision to get smaller accumulators
+       with cnp.tag("Rounded distance"):
+           distance_matrix = cnp.round_bit_pattern(distance_matrix, lsbs_to_remove=self.rounder)
 
        # The square root in the Euclidean distance calculation is not applied to speed up FHE
        # computations.
@@ -2065,6 +2078,25 @@
        return numpy.expand_dims(topk_labels, axis=0)
 
    def compile(self, *args, **kwargs) -> Circuit:
+       def force_auto_adjust_rounder_in_configuration(configuration):
+           if configuration is None:
+               configuration = Configuration(auto_adjust_rounders=True)
+           else:
+               configuration.auto_adjust_rounders = True
+           return configuration
+
+       # If a configuration instance is given as a positional parameter, set auto_adjust_rounders
+       if len(args) >= 2:
+           configuration = force_auto_adjust_rounder_in_configuration(args[1])
+           args_list = list(args)
+           args_list[1] = configuration
+           args = tuple(args_list)
+
+       # Else, retrieve the configuration from kwargs if it exists, or create a new one, and set
+       # auto_adjust_rounders
+       else:
+           configuration = kwargs.get("configuration", None)
+           kwargs["configuration"] = force_auto_adjust_rounder_in_configuration(configuration)
        return BaseEstimator.compile(self, *args, **kwargs)
 
    def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
@@ -2080,7 +2112,7 @@ def post_processing(self, y_preds: numpy.ndarray) -> numpy.ndarray:
             numpy.ndarray: The majority vote.
         """
-        self.topk = y_preds.squeeze()
+        self._topk_labels = y_preds.squeeze()
 
         y_preds_processed = []
         for y in y_preds:
diff --git a/src/concrete/ml/sklearn/neighbors.py b/src/concrete/ml/sklearn/neighbors.py
index 368c9690b..3bc98a87a 100644
--- a/src/concrete/ml/sklearn/neighbors.py
+++ b/src/concrete/ml/sklearn/neighbors.py
@@ -15,6 +15,7 @@ class KNeighborsClassifier(SklearnKNeighborsClassifierMixin):
     Parameters:
         n_bits (int): Number of bits to quantize the model. The value will be used for quantizing
             inputs and X_fit. Default to 3.
+        rounding_threshold_bits (int): Number of most significant bits to keep when rounding accumulators. Defaults to 6.
 
     For more details on KNeighborsClassifier please refer to the scikit-learn documentation:
     https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
@@ -27,6 +28,7 @@ def __init__(
         self,
         n_bits=2,
         n_neighbors=3,
+        rounding_threshold_bits=6,
         *,
         weights="uniform",
         algorithm="auto",
@@ -59,6 +61,7 @@ def __init__(
         self.metric_params = metric_params
         self.n_jobs = n_jobs
         self.weights = weights
+        self.rounding_threshold_bits = rounding_threshold_bits
 
     def dump_dict(self) -> Dict[str, Any]:
         assert self._q_fit_X_quantizer is not None, self._is_not_fitted_error_message()