From 14d3132f023dcb558dc92c0ace72273144bde37d Mon Sep 17 00:00:00 2001
From: Samir Nasibli
Date: Wed, 26 Jul 2023 13:35:10 +0200
Subject: [PATCH] MAINT: align `daal4py`, `onedal` and `sklearnex` modules with isort and black (#1375)

* apply isort and black fixes for daal4py, onedal and sklearnex modules
---
 daal4py/__init__.py | 33 +-
 daal4py/__main__.py | 28 +-
 daal4py/mb/__init__.py | 6 +-
 daal4py/mb/model_builders.py | 109 +-
 daal4py/oneapi/__init__.py | 41 +-
 daal4py/sklearn/__init__.py | 30 +-
 daal4py/sklearn/_device_offload.py | 42 +-
 daal4py/sklearn/_utils.py | 80 +-
 daal4py/sklearn/cluster/__init__.py | 9 +-
 daal4py/sklearn/cluster/_dbscan.py | 62 +-
 daal4py/sklearn/cluster/_k_means_0_22.py | 317 +++--
 daal4py/sklearn/cluster/_k_means_0_23.py | 353 ++---
 daal4py/sklearn/cluster/dbscan.py | 4 +-
 daal4py/sklearn/cluster/k_means.py | 6 +-
 daal4py/sklearn/cluster/tests/test_dbscan.py | 81 +-
 daal4py/sklearn/decomposition/__init__.py | 6 +-
 daal4py/sklearn/decomposition/_pca.py | 259 ++--
 .../sklearn/ensemble/AdaBoostClassifier.py | 120 +-
 daal4py/sklearn/ensemble/GBTDAAL.py | 157 ++-
 daal4py/sklearn/ensemble/__init__.py | 17 +-
 daal4py/sklearn/ensemble/_forest.py | 4 +-
 daal4py/sklearn/linear_model/__init__.py | 21 +-
 .../linear_model/_coordinate_descent.py | 440 ++++---
 daal4py/sklearn/linear_model/_linear.py | 196 +--
 daal4py/sklearn/linear_model/_ridge.py | 175 ++-
 .../linear_model/coordinate_descent.py | 4 +-
 daal4py/sklearn/linear_model/linear.py | 4 +-
 daal4py/sklearn/linear_model/logistic_loss.py | 86 +-
 daal4py/sklearn/linear_model/logistic_path.py | 558 ++++----
 daal4py/sklearn/linear_model/ridge.py | 4 +-
 .../sklearn/linear_model/tests/test_linear.py | 52 +-
 daal4py/sklearn/manifold/__init__.py | 6 +-
 daal4py/sklearn/manifold/_t_sne.py | 304 +++--
 daal4py/sklearn/metrics/__init__.py | 8 +-
 daal4py/sklearn/metrics/_pairwise.py | 119 +-
 daal4py/sklearn/metrics/_ranking.py | 156 ++-
 daal4py/sklearn/model_selection/__init__.py | 6 +-
 daal4py/sklearn/model_selection/_split.py | 247 ++--
 .../model_selection/tests/test_split.py | 34 +-
 daal4py/sklearn/monkeypatch/dispatcher.py | 203 +--
 .../sklearn/monkeypatch/tests/_models_info.py | 163 +--
 .../monkeypatch/tests/test_monkeypatch.py | 21 +-
 .../monkeypatch/tests/test_patching.py | 44 +-
 .../tests/utils/_launch_algorithms.py | 78 +-
 daal4py/sklearn/neighbors/__init__.py | 8 +-
 daal4py/sklearn/neighbors/_base.py | 336 +++--
 daal4py/sklearn/neighbors/_classification.py | 213 ++-
 daal4py/sklearn/neighbors/_regression.py | 121 +-
 daal4py/sklearn/neighbors/_unsupervised.py | 57 +-
 .../neighbors/tests/test_kneighbors.py | 82 +-
 daal4py/sklearn/svm/__init__.py | 6 +-
 daal4py/sklearn/svm/_svm_0_22.py | 385 +++---
 daal4py/sklearn/svm/_svm_0_23.py | 386 +++---
 daal4py/sklearn/svm/svm.py | 8 +-
 daal4py/sklearn/test/test_common.py | 52 +-
 daal4py/sklearn/tree/__init__.py | 6 +-
 daal4py/sklearn/tree/decision_tree.py | 110 +-
 daal4py/sklearn/utils/__init__.py | 12 +-
 daal4py/sklearn/utils/base.py | 13 +-
 daal4py/sklearn/utils/validation.py | 304 +++--
 onedal/__init__.py | 35 +-
 onedal/_device_offload.py | 29 +-
 onedal/basic_statistics/__init__.py | 6 +-
 onedal/basic_statistics/basic_statistics.py | 52 +-
 .../tests/test_basic_statistics.py | 34 +-
 onedal/cluster/__init__.py | 11 +-
 onedal/cluster/kmeans.py | 121 +-
 onedal/cluster/kmeans_init.py | 57 +-
 onedal/cluster/tests/test_kmeans.py | 31 +-
 onedal/cluster/tests/test_kmeans_init.py | 28 +-
 onedal/common/_estimator_checks.py | 17 +-
 onedal/common/_mixin.py | 12 +-
 onedal/common/_policy.py | 19 +-
onedal/common/_spmd_policy.py | 8 +- onedal/common/tests/test_policy.py | 35 +- onedal/datatypes/__init__.py | 12 +- onedal/datatypes/_data_conversion.py | 18 +- onedal/datatypes/tests/test_data.py | 74 +- onedal/decomposition/__init__.py | 6 +- onedal/decomposition/pca.py | 59 +- onedal/ensemble/__init__.py | 18 +- onedal/ensemble/forest.py | 659 +++++----- onedal/ensemble/tests/test_random_forest.py | 74 +- onedal/linear_model/__init__.py | 6 +- onedal/linear_model/linear_model.py | 64 +- .../tests/test_linear_regression.py | 38 +- onedal/neighbors/__init__.py | 6 +- onedal/neighbors/neighbors.py | 385 +++--- .../tests/test_knn_classification.py | 14 +- onedal/primitives/__init__.py | 15 +- onedal/primitives/get_tree.py | 3 +- onedal/primitives/kernel_functions.py | 45 +- .../primitives/tests/test_kernel_functions.py | 55 +- onedal/spmd/__init__.py | 17 +- onedal/spmd/basic_statistics/__init__.py | 6 +- .../spmd/basic_statistics/basic_statistics.py | 11 +- onedal/spmd/cluster/__init__.py | 8 +- onedal/spmd/cluster/kmeans.py | 10 +- onedal/spmd/decomposition/__init__.py | 6 +- onedal/spmd/decomposition/pca.py | 10 +- onedal/spmd/ensemble/__init__.py | 6 +- onedal/spmd/ensemble/forest.py | 8 +- onedal/spmd/linear_model/__init__.py | 6 +- onedal/spmd/linear_model/linear_model.py | 11 +- onedal/spmd/neighbors/__init__.py | 6 +- onedal/spmd/neighbors/neighbors.py | 31 +- onedal/svm/__init__.py | 6 +- onedal/svm/svm.py | 427 +++++-- onedal/svm/tests/test_csr_svm.py | 287 ++++- onedal/svm/tests/test_nusvc.py | 97 +- onedal/svm/tests/test_nusvr.py | 101 +- onedal/svm/tests/test_svc.py | 124 +- onedal/svm/tests/test_svr.py | 136 +- onedal/tests/utils/_device_selection.py | 35 +- onedal/utils/__init__.py | 38 +- onedal/utils/validation.py | 218 ++-- sklearnex/__main__.py | 28 +- sklearnex/_config.py | 4 +- sklearnex/_device_offload.py | 116 +- sklearnex/_utils.py | 48 +- sklearnex/basic_statistics/__init__.py | 6 +- .../basic_statistics/basic_statistics.py | 4 +- sklearnex/cluster/__init__.py | 8 +- sklearnex/cluster/dbscan.py | 4 +- sklearnex/cluster/k_means.py | 4 +- sklearnex/cluster/tests/test_dbscan.py | 10 +- sklearnex/cluster/tests/test_kmeans.py | 10 +- sklearnex/decomposition/__init__.py | 6 +- sklearnex/decomposition/pca.py | 4 +- sklearnex/decomposition/tests/test_pca.py | 9 +- sklearnex/dispatcher.py | 52 +- sklearnex/ensemble/__init__.py | 6 +- sklearnex/ensemble/forest.py | 4 +- sklearnex/ensemble/tests/test_forest.py | 32 +- sklearnex/glob/__main__.py | 47 +- sklearnex/glob/dispatcher.py | 35 +- sklearnex/linear_model/__init__.py | 20 +- sklearnex/linear_model/coordinate_descent.py | 4 +- sklearnex/linear_model/logistic_path.py | 6 +- sklearnex/linear_model/ridge.py | 4 +- sklearnex/linear_model/tests/test_linear.py | 17 +- sklearnex/linear_model/tests/test_logreg.py | 7 +- sklearnex/manifold/__init__.py | 6 +- sklearnex/manifold/t_sne.py | 4 +- sklearnex/manifold/tests/test_tsne.py | 7 +- sklearnex/metrics/__init__.py | 10 +- sklearnex/metrics/pairwise.py | 4 +- sklearnex/metrics/ranking.py | 4 +- sklearnex/metrics/tests/test_metrics.py | 14 +- sklearnex/model_selection/__init__.py | 6 +- sklearnex/model_selection/split.py | 4 +- .../tests/test_model_selection.py | 7 +- sklearnex/neighbors/__init__.py | 14 +- sklearnex/neighbors/common.py | 159 ++- sklearnex/neighbors/knn_classification.py | 287 +++-- sklearnex/neighbors/knn_regression.py | 270 ++-- sklearnex/neighbors/knn_unsupervised.py | 174 ++- sklearnex/neighbors/lof.py | 161 ++- sklearnex/neighbors/tests/test_neighbors.py | 
20 +- sklearnex/preview/__init__.py | 6 +- sklearnex/preview/cluster/__init__.py | 6 +- sklearnex/preview/cluster/_common.py | 8 +- sklearnex/preview/cluster/k_means.py | 163 ++- sklearnex/preview/decomposition/__init__.py | 6 +- sklearnex/preview/decomposition/pca.py | 203 +-- .../decomposition/tests/test_preview_pca.py | 11 +- sklearnex/preview/ensemble/__init__.py | 10 +- sklearnex/preview/ensemble/extra_trees.py | 1137 ++++++++++------- sklearnex/preview/ensemble/forest.py | 851 ++++++------ .../ensemble/tests/test_preview_ensemble.py | 47 +- sklearnex/preview/linear_model/__init__.py | 8 +- sklearnex/preview/linear_model/_common.py | 10 +- sklearnex/preview/linear_model/linear.py | 189 +-- .../linear_model/tests/test_preview_linear.py | 14 +- sklearnex/spmd/__init__.py | 17 +- sklearnex/spmd/basic_statistics/__init__.py | 6 +- .../spmd/basic_statistics/basic_statistics.py | 4 +- sklearnex/spmd/cluster/__init__.py | 8 +- sklearnex/spmd/cluster/kmeans.py | 4 +- sklearnex/spmd/decomposition/__init__.py | 6 +- sklearnex/spmd/decomposition/pca.py | 4 +- sklearnex/spmd/ensemble/__init__.py | 6 +- sklearnex/spmd/ensemble/forest.py | 34 +- sklearnex/spmd/linear_model/__init__.py | 6 +- sklearnex/spmd/linear_model/linear_model.py | 4 +- sklearnex/spmd/neighbors/__init__.py | 6 +- sklearnex/spmd/neighbors/neighbors.py | 6 +- sklearnex/svm/__init__.py | 19 +- sklearnex/svm/_common.py | 77 +- sklearnex/svm/nusvc.py | 165 ++- sklearnex/svm/nusvr.py | 96 +- sklearnex/svm/svc.py | 212 +-- sklearnex/svm/svr.py | 96 +- sklearnex/svm/tests/test_svm.py | 40 +- sklearnex/tests/_models_info.py | 177 +-- sklearnex/tests/test_config.py | 17 +- sklearnex/tests/test_memory_usage.py | 126 +- sklearnex/tests/test_monkeypatch.py | 105 +- sklearnex/tests/test_patching.py | 11 +- .../tests/test_run_to_run_stability_tests.py | 425 +++--- sklearnex/tests/utils/_launch_algorithms.py | 78 +- sklearnex/utils/__init__.py | 6 +- sklearnex/utils/validation.py | 4 +- 203 files changed, 9594 insertions(+), 6929 deletions(-) diff --git a/daal4py/__init__.py b/daal4py/__init__.py index f3b7181c27..19c4cb0410 100644 --- a/daal4py/__init__.py +++ b/daal4py/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,44 +13,47 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import platform + if "Windows" in platform.system(): import os - import sys import site + import sys + current_path = os.path.dirname(__file__) path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") path_to_oneapi_backend = os.path.join(current_path, "oneapi") if sys.version_info.minor >= 8: - if 'DALROOT' in os.environ: - dal_root_redist = os.path.join(os.environ['DALROOT'], "redist", "intel64") + if "DALROOT" in os.environ: + dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", "intel64") if os.path.exists(dal_root_redist): os.add_dll_directory(dal_root_redist) - os.environ['PATH'] = dal_root_redist + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = dal_root_redist + os.pathsep + os.environ["PATH"] os.add_dll_directory(path_to_libs) os.add_dll_directory(path_to_oneapi_backend) - os.environ['PATH'] = path_to_libs + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = path_to_libs + os.pathsep + os.environ["PATH"] try: from daal4py._daal4py import * from daal4py._daal4py import ( - _get__version__, + __has_dist__, _get__daal_link_version__, _get__daal_run_version__, - __has_dist__) + _get__version__, + ) except ImportError as e: s = str(e) - if 'libfabric' in s: + if "libfabric" in s: raise ImportError( - s + '\n\nActivating your conda environment or sourcing mpivars.' - '[c]sh/psxevars.[c]sh may solve the issue.\n') + s + "\n\nActivating your conda environment or sourcing mpivars." + "[c]sh/psxevars.[c]sh may solve the issue.\n" + ) raise -from . import mb -from . import sklearn +from . import mb, sklearn -__all__ = ['mb', 'sklearn'] +__all__ = ["mb", "sklearn"] diff --git a/daal4py/__main__.py b/daal4py/__main__.py index 9b3a27a43d..3ded7d1e10 100644 --- a/daal4py/__main__.py +++ b/daal4py/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sys + from .sklearn import patch_sklearn @@ -29,27 +30,30 @@ def _main(): Python* patches of scikit-learn, optimizing solvers of scikit-learn with Intel(R) oneAPI Data Analytics Library. 
""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('-m', action='store_true', dest='module', - help="Executes following as a module") - parser.add_argument('name', help="Script or module name") - parser.add_argument('args', nargs=argparse.REMAINDER, - help="Command line arguments") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "-m", action="store_true", dest="module", help="Executes following as a module" + ) + parser.add_argument("name", help="Script or module name") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Command line arguments") args = parser.parse_args() try: import sklearn + patch_sklearn() except ImportError: print("Scikit-learn could not be imported. Nothing to patch") sys.argv = [args.name] + args.args - if '_' + args.name in globals(): - return globals()['_' + args.name](*args.args) + if "_" + args.name in globals(): + return globals()["_" + args.name](*args.args) import runpy + runf = runpy.run_module if args.module else runpy.run_path - runf(args.name, run_name='__main__') + runf(args.name, run_name="__main__") sys.exit(_main()) diff --git a/daal4py/mb/__init__.py b/daal4py/mb/__init__.py index 279681ca07..5478f23b0b 100644 --- a/daal4py/mb/__init__.py +++ b/daal4py/mb/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .model_builders import GBTDAALBaseModel, convert_model -__all__ = ['GBTDAALBaseModel', 'convert_model'] +__all__ = ["GBTDAALBaseModel", "convert_model"] diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index c3f2a99be7..aafbc0be47 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py Model builders API import numpy as np + import daal4py as d4p try: from pandas import DataFrame from pandas.core.dtypes.cast import find_common_type + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False @@ -41,7 +43,7 @@ def getFPType(X): dt = find_common_type(X.dtypes.tolist()) return parse_dtype(dt) - dt = getattr(X, 'dtype', None) + dt = getattr(X, "dtype", None) return parse_dtype(dt) @@ -65,9 +67,9 @@ def _get_params_from_xgboost(self, params): self.n_features_in_ = int(params["learner"]["learner_model_param"]["num_feature"]) def _get_params_from_catboost(self, params): - if 'class_params' in params['model_info']: - self.n_classes_ = len(params['model_info']['class_params']['class_to_label']) - self.n_features_in_ = len(params['features_info']['float_features']) + if "class_params" in params["model_info"]: + self.n_classes_ = len(params["model_info"]["class_params"]["class_to_label"]) + self.n_features_in_ = len(params["features_info"]["float_features"]) def _convert_model_from_lightgbm(self, booster): lgbm_params = d4p.get_lightgbm_params(booster) @@ -85,8 +87,10 @@ def _convert_model_from_catboost(self, booster): self._get_params_from_catboost(catboost_params) def _convert_model(self, model): - (submodule_name, class_name) = (model.__class__.__module__, - model.__class__.__name__) + (submodule_name, class_name) = ( + model.__class__.__module__, + model.__class__.__name__, + ) self_class_name = self.__class__.__name__ # Build GBTDAALClassifier from LightGBM @@ -94,82 +98,101 @@ def _convert_model(self, model): if self_class_name == "GBTDAALClassifier": self._convert_model_from_lightgbm(model.booster_) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALClassifier from XGBoost elif (submodule_name, class_name) == ("xgboost.sklearn", "XGBClassifier"): if self_class_name == "GBTDAALClassifier": self._convert_model_from_xgboost(model.get_booster()) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALClassifier from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoostClassifier"): if self_class_name == "GBTDAALClassifier": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALClassifier can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALClassifier can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from LightGBM elif (submodule_name, class_name) == ("lightgbm.sklearn", "LGBMRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_lightgbm(model.booster_) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from XGBoost elif 
(submodule_name, class_name) == ("xgboost.sklearn", "XGBRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_xgboost(model.get_booster()) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALRegressor from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoostRegressor"): if self_class_name == "GBTDAALRegressor": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALRegressor can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALRegressor can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from LightGBM elif (submodule_name, class_name) == ("lightgbm.basic", "Booster"): if self_class_name == "GBTDAALModel": self._convert_model_from_lightgbm(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from XGBoost elif (submodule_name, class_name) == ("xgboost.core", "Booster"): if self_class_name == "GBTDAALModel": self._convert_model_from_xgboost(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) # Build GBTDAALModel from CatBoost elif (submodule_name, class_name) == ("catboost.core", "CatBoost"): if self_class_name == "GBTDAALModel": self._convert_model_from_catboost(model) else: - raise TypeError(f"Only GBTDAALModel can be created from\ - {submodule_name}.{class_name} (got {self_class_name})") + raise TypeError( + f"Only GBTDAALModel can be created from\ + {submodule_name}.{class_name} (got {self_class_name})" + ) else: raise TypeError(f"Unknown model format {submodule_name}.{class_name}") def _predict_classification(self, X, fptype, resultsToEvaluate): if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') + raise ValueError("Shape of input is different from what was seen in `fit`") - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.") - .format(type(self).__name__)) + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." 
+ ).format(type(self).__name__) + ) # Prediction predict_algo = d4p.gbt_classification_prediction( - fptype=fptype, - nClasses=self.n_classes_, - resultsToEvaluate=resultsToEvaluate) + fptype=fptype, nClasses=self.n_classes_, resultsToEvaluate=resultsToEvaluate + ) predict_result = predict_algo.compute(X, self.daal_model_) if resultsToEvaluate == "computeClassLabels": @@ -179,13 +202,15 @@ def _predict_classification(self, X, fptype, resultsToEvaluate): def _predict_regression(self, X, fptype): if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') - - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.").format( - type(self).__name__)) + raise ValueError("Shape of input is different from what was seen in `fit`") + + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." + ).format(type(self).__name__) + ) # Prediction predict_algo = d4p.gbt_regression_prediction(fptype=fptype) diff --git a/daal4py/oneapi/__init__.py b/daal4py/oneapi/__init__.py index d76d4abb57..973060f7f6 100644 --- a/daal4py/oneapi/__init__.py +++ b/daal4py/oneapi/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import platform + if "Windows" in platform.system(): import os import sys @@ -23,13 +24,14 @@ current_path = os.path.dirname(__file__) - sitepackages_path = sysconfig.get_paths()['purelib'] - installed_package_path = os.path.join(sitepackages_path, 'daal4py', 'oneapi') + sitepackages_path = sysconfig.get_paths()["purelib"] + installed_package_path = os.path.join(sitepackages_path, "daal4py", "oneapi") if sys.version_info.minor >= 8: - if 'DPCPPROOT' in os.environ: - dpcpp_rt_root_bin = os.path.join(os.environ['DPCPPROOT'], "windows", "bin") - dpcpp_rt_root_redist = os.path.join(os.environ['DPCPPROOT'], "windows", - "redist", "intel64_win", "compiler") + if "DPCPPROOT" in os.environ: + dpcpp_rt_root_bin = os.path.join(os.environ["DPCPPROOT"], "windows", "bin") + dpcpp_rt_root_redist = os.path.join( + os.environ["DPCPPROOT"], "windows", "redist", "intel64_win", "compiler" + ) if os.path.exists(dpcpp_rt_root_bin): os.add_dll_directory(dpcpp_rt_root_bin) if os.path.exists(dpcpp_rt_root_redist): @@ -37,28 +39,29 @@ os.add_dll_directory(current_path) if os.path.exists(installed_package_path): os.add_dll_directory(installed_package_path) - os.environ['PATH'] = current_path + os.pathsep + os.environ['PATH'] - os.environ['PATH'] = installed_package_path + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = current_path + os.pathsep + os.environ["PATH"] + os.environ["PATH"] = installed_package_path + os.pathsep + os.environ["PATH"] try: from daal4py._oneapi import * from daal4py._oneapi import ( - _get_sycl_ctxt, _get_device_name_sycl_ctxt, + _get_in_sycl_ctxt, + _get_sycl_ctxt, _get_sycl_ctxt_params, - _get_in_sycl_ctxt ) except ModuleNotFoundError: raise except ImportError: import daal4py - version = daal4py._get__version__()[1:-1].split(', ') + + version = daal4py._get__version__()[1:-1].split(", ") major_version, minor_version = version[0], version[1] raise ImportError( - f'dpcpp_cpp_rt >= {major_version}.{minor_version} ' - 'has to be installed or upgraded to use this module.\n' - 'You can download or upgrade it using the following commands:\n' - f'`pip install --upgrade dpcpp_cpp_rt>={major_version}.{minor_version}.*` ' - 'or ' - f'`conda install -c intel dpcpp_cpp_rt>={major_version}.{minor_version}.*`' + f"dpcpp_cpp_rt >= {major_version}.{minor_version} " + "has to be installed or upgraded to use this module.\n" + "You can download or upgrade it using the following commands:\n" + f"`pip install --upgrade dpcpp_cpp_rt>={major_version}.{minor_version}.*` " + "or " + f"`conda install -c intel dpcpp_cpp_rt>={major_version}.{minor_version}.*`" ) diff --git a/daal4py/sklearn/__init__.py b/daal4py/sklearn/__init__.py index b10e26370a..92cad6beef 100755 --- a/daal4py/sklearn/__init__.py +++ b/daal4py/sklearn/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .monkeypatch.dispatcher import enable as patch_sklearn +from .monkeypatch.dispatcher import _get_map_of_algorithms as sklearn_patch_map +from .monkeypatch.dispatcher import _patch_names as sklearn_patch_names from .monkeypatch.dispatcher import disable as unpatch_sklearn +from .monkeypatch.dispatcher import enable as patch_sklearn from .monkeypatch.dispatcher import patch_is_enabled as sklearn_is_patched -from .monkeypatch.dispatcher import _patch_names as sklearn_patch_names -from .monkeypatch.dispatcher import _get_map_of_algorithms as sklearn_patch_map __all__ = [ - 'cluster', 'decomposition', 'ensemble', 'linear_model', - 'manifold', 'metrics', 'model_selection', 'neighbors', - 'patch_sklearn', 'sklearn_is_patched', 'sklearn_patch_map', - 'sklearn_patch_names', 'svm', 'tree', 'unpatch_sklearn', 'utils' + "cluster", + "decomposition", + "ensemble", + "linear_model", + "manifold", + "metrics", + "model_selection", + "neighbors", + "patch_sklearn", + "sklearn_is_patched", + "sklearn_patch_map", + "sklearn_patch_names", + "svm", + "tree", + "unpatch_sklearn", + "utils", ] diff --git a/daal4py/sklearn/_device_offload.py b/daal4py/sklearn/_device_offload.py index df6b13458b..1fb3bd93f4 100644 --- a/daal4py/sklearn/_device_offload.py +++ b/daal4py/sklearn/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from functools import wraps try: from sklearnex._config import get_config - from sklearnex._device_offload import (_get_global_queue, - _transfer_to_host, - _copy_to_usm) + from sklearnex._device_offload import ( + _copy_to_usm, + _get_global_queue, + _transfer_to_host, + ) + _sklearnex_available = True except ImportError: import logging - logging.warning('Device support is limited in daal4py patching. ' - 'Use Intel(R) Extension for Scikit-learn* ' - 'for full experience.') + + logging.warning( + "Device support is limited in daal4py patching. " + "Use Intel(R) Extension for Scikit-learn* " + "for full experience." 
+ ) _sklearnex_available = False @@ -42,9 +48,7 @@ def _extract_usm_iface(*args, **kwargs): allargs = (*args, *kwargs.values()) if len(allargs) == 0: return None - return getattr(allargs[0], - '__sycl_usm_array_interface__', - None) + return getattr(allargs[0], "__sycl_usm_array_interface__", None) def _run_on_device(func, queue, obj=None, *args, **kwargs): @@ -54,13 +58,15 @@ def dispatch_by_obj(obj, func, *args, **kwargs): return func(*args, **kwargs) if queue is not None: - from daal4py.oneapi import sycl_context, _get_in_sycl_ctxt + from daal4py.oneapi import _get_in_sycl_ctxt, sycl_context if _get_in_sycl_ctxt() is False: - host_offload = get_config()['allow_fallback_to_host'] + host_offload = get_config()["allow_fallback_to_host"] - with sycl_context('gpu' if queue.sycl_device.is_gpu else 'cpu', - host_offload_on_fail=host_offload): + with sycl_context( + "gpu" if queue.sycl_device.is_gpu else "cpu", + host_offload_on_fail=host_offload, + ): return dispatch_by_obj(obj, func, *args, **kwargs) return dispatch_by_obj(obj, func, *args, **kwargs) @@ -72,19 +78,23 @@ def wrapper_impl(obj, *args, **kwargs): usm_iface = _extract_usm_iface(*args, **kwargs) q, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) result = _run_on_device(func, q, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, '__array_interface__'): + if usm_iface is not None and hasattr(result, "__array_interface__"): return _copy_to_usm(q, result) return result return _run_on_device(func, None, obj, *args, **kwargs) if freefunc: + @wraps(func) def wrapper_free(*args, **kwargs): return wrapper_impl(None, *args, **kwargs) + return wrapper_free @wraps(func) def wrapper_with_self(self, *args, **kwargs): return wrapper_impl(self, *args, **kwargs) + return wrapper_with_self + return decorator diff --git a/daal4py/sklearn/_utils.py b/daal4py/sklearn/_utils.py index d257185e90..01730e21d9 100644 --- a/daal4py/sklearn/_utils.py +++ b/daal4py/sklearn/_utils.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,37 +12,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np -import sys import os +import sys import warnings +import numpy as np from numpy.lib.recfunctions import require_fields +from sklearn import __version__ as sklearn_version from daal4py import _get__daal_link_version__ as dv -from sklearn import __version__ as sklearn_version + try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version + import logging try: from pandas import DataFrame from pandas.core.dtypes.cast import find_common_type + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False try: from daal4py.oneapi import is_in_sycl_ctxt as is_in_ctx + ctx_imported = True except (ImportError, ModuleNotFoundError): ctx_imported = False -oneapi_is_available = 'daal4py.oneapi' in sys.modules +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_device_name_sycl_ctxt @@ -53,11 +57,15 @@ def set_idp_sklearn_verbose(): if logLevel is not None: logging.basicConfig( stream=sys.stdout, - format='%(levelname)s: %(message)s', level=logLevel.upper()) + format="%(levelname)s: %(message)s", + level=logLevel.upper(), + ) except Exception: - warnings.warn('Unknown level "{}" for logging.\n' - 'Please, use one of "CRITICAL", "ERROR", ' - '"WARNING", "INFO", "DEBUG".'.format(logLevel)) + warnings.warn( + 'Unknown level "{}" for logging.\n' + 'Please, use one of "CRITICAL", "ERROR", ' + '"WARNING", "INFO", "DEBUG".'.format(logLevel) + ) def daal_check_version(rule): @@ -83,7 +91,7 @@ def daal_check_version(rule): def sklearn_check_version(ver): if ver in sklearn_versions_map.keys(): return sklearn_versions_map[ver] - if hasattr(Version(ver), 'base_version'): + if hasattr(Version(ver), "base_version"): base_sklearn_version = Version(sklearn_version).base_version res = bool(Version(base_sklearn_version) >= Version(ver)) else: @@ -111,7 +119,7 @@ def getFPType(X): dt = find_common_type(X.dtypes.tolist()) return parse_dtype(dt) - dt = getattr(X, 'dtype', None) + dt = getattr(X, "dtype", None) return parse_dtype(dt) @@ -128,15 +136,16 @@ def get_patch_message(s): message = "running accelerated version on " if oneapi_is_available: dev = _get_device_name_sycl_ctxt() - if dev == 'cpu' or dev is None: - message += 'CPU' - elif dev == 'gpu': - message += 'GPU' + if dev == "cpu" or dev is None: + message += "CPU" + elif dev == "gpu": + message += "GPU" else: - raise ValueError(f"Unexpected device name {dev}." - " Supported types are cpu and gpu") + raise ValueError( + f"Unexpected device name {dev}." 
" Supported types are cpu and gpu" + ) else: - message += 'CPU' + message += "CPU" elif s == "sklearn": message = "fallback to original Scikit-learn" @@ -145,7 +154,8 @@ def get_patch_message(s): else: raise ValueError( f"Invalid input - expected one of 'daal','sklearn'," - f" 'sklearn_after_daal', got {s}") + f" 'sklearn_after_daal', got {s}" + ) return message @@ -182,14 +192,18 @@ def check_tree_nodes(tree_nodes): def convert_to_old_tree_nodes(tree_nodes): # conversion from sklearn>=1.3 tree nodes format to previous format: # removal of 'missing_go_to_left' field from node dtype - new_field = 'missing_go_to_left' + new_field = "missing_go_to_left" new_dtype = tree_nodes.dtype - old_dtype = np.dtype([ - (key, value[0]) for key, value in - new_dtype.fields.items() if key != new_field]) + old_dtype = np.dtype( + [ + (key, value[0]) + for key, value in new_dtype.fields.items() + if key != new_field + ] + ) return require_fields(tree_nodes, old_dtype) - if sklearn_check_version('1.3'): + if sklearn_check_version("1.3"): return tree_nodes else: return convert_to_old_tree_nodes(tree_nodes) @@ -200,7 +214,7 @@ def __init__(self, scope_name): self.scope_name = scope_name self.patching_is_enabled = True self.messages = [] - self.logger = logging.getLogger('sklearnex') + self.logger = logging.getLogger("sklearnex") def _iter_conditions(self, conditions_and_messages): result = [] @@ -212,7 +226,8 @@ def _iter_conditions(self, conditions_and_messages): def and_conditions(self, conditions_and_messages, conditions_merging=all): self.patching_is_enabled &= conditions_merging( - self._iter_conditions(conditions_and_messages)) + self._iter_conditions(conditions_and_messages) + ) return self.patching_is_enabled def and_condition(self, condition, message): @@ -220,7 +235,8 @@ def and_condition(self, condition, message): def or_conditions(self, conditions_and_messages, conditions_merging=all): self.patching_is_enabled |= conditions_merging( - self._iter_conditions(conditions_and_messages)) + self._iter_conditions(conditions_and_messages) + ) return self.patching_is_enabled def write_log(self): @@ -228,11 +244,13 @@ def write_log(self): self.logger.info(f"{self.scope_name}: {get_patch_message('daal')}") else: self.logger.debug( - f'{self.scope_name}: debugging for the patch is enabled to track' - ' the usage of Intel® oneAPI Data Analytics Library (oneDAL)') + f"{self.scope_name}: debugging for the patch is enabled to track" + " the usage of Intel® oneAPI Data Analytics Library (oneDAL)" + ) for message in self.messages: self.logger.debug( - f'{self.scope_name}: patching failed with cause - {message}') + f"{self.scope_name}: patching failed with cause - {message}" + ) self.logger.info(f"{self.scope_name}: {get_patch_message('sklearn')}") def get_status(self, logs=False): diff --git a/daal4py/sklearn/cluster/__init__.py b/daal4py/sklearn/cluster/__init__.py index ed774f234f..dd7ceeb93b 100644 --- a/daal4py/sklearn/cluster/__init__.py +++ b/daal4py/sklearn/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from .k_means import KMeans from .dbscan import DBSCAN -__all__ = ['KMeans', 'DBSCAN'] +from .k_means import KMeans + +__all__ = ["KMeans", "DBSCAN"] diff --git a/daal4py/sklearn/cluster/_dbscan.py b/daal4py/sklearn/cluster/_dbscan.py index 8a10284977..0cee113de5 100644 --- a/daal4py/sklearn/cluster/_dbscan.py +++ b/daal4py/sklearn/cluster/_dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np -from scipy import sparse as sp import numbers +import numpy as np +from scipy import sparse as sp +from sklearn.cluster import DBSCAN as DBSCAN_original from sklearn.utils import check_array from sklearn.utils.validation import _check_sample_weight -from sklearn.cluster import DBSCAN as DBSCAN_original - import daal4py -from daal4py.sklearn._utils import ( - make2d, getFPType, PatchingConditionsChain) +from daal4py.sklearn._utils import PatchingConditionsChain, getFPType, make2d from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar @@ -40,12 +38,12 @@ def _daal_dbscan(X, eps=0.5, min_samples=5, sample_weight=None): fpt = getFPType(XX) alg = daal4py.dbscan( - method='defaultDense', + method="defaultDense", fptype=fpt, epsilon=float(eps), minObservations=int(min_samples), memorySavingMode=False, - resultsToCompute="computeCoreIndices" + resultsToCompute="computeCoreIndices", ) daal_res = alg.compute(XX, ww) @@ -189,16 +187,17 @@ class DBSCAN(DBSCAN_original): >>> clustering DBSCAN(eps=3, min_samples=2) """ - if sklearn_check_version('1.2'): + + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**DBSCAN_original._parameter_constraints} def __init__( self, eps=0.5, min_samples=5, - metric='euclidean', + metric="euclidean", metric_params=None, - algorithm='auto', + algorithm="auto", leaf_size=30, p=None, n_jobs=None, @@ -282,26 +281,29 @@ def fit(self, X, y=None, sample_weight=None): if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.DBSCAN.fit") - _dal_ready = _patching_status.and_conditions([ - (self.algorithm in ['auto', 'brute'], - f"'{self.algorithm}' algorithm is not supported. " - "Only 'auto' and 'brute' algorithms are supported"), - (self.metric == 'euclidean' or (self.metric == 'minkowski' and self.p == 2), - f"'{self.metric}' (p={self.p}) metric is not supported. " - "Only 'euclidean' or 'minkowski' with p=2 metrics are supported."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.DBSCAN.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.algorithm in ["auto", "brute"], + f"'{self.algorithm}' algorithm is not supported. " + "Only 'auto' and 'brute' algorithms are supported", + ), + ( + self.metric == "euclidean" + or (self.metric == "minkowski" and self.p == 2), + f"'{self.metric}' (p={self.p}) metric is not supported. " + "Only 'euclidean' or 'minkowski' with p=2 metrics are supported.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) core_ind, assignments = _daal_dbscan( - X, - self.eps, - self.min_samples, - sample_weight=sample_weight + X, self.eps, self.min_samples, sample_weight=sample_weight ) self.core_sample_indices_ = core_ind self.labels_ = assignments diff --git a/daal4py/sklearn/cluster/_k_means_0_22.py b/daal4py/sklearn/cluster/_k_means_0_22.py index 4856cbb4dd..f70cce8ef9 100644 --- a/daal4py/sklearn/cluster/_k_means_0_22.py +++ b/daal4py/sklearn/cluster/_k_means_0_22.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp - -from sklearn.utils import (check_random_state, check_array) +from sklearn.utils import check_array, check_random_state from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.utils.validation import (check_is_fitted, _num_samples) +from sklearn.utils.validation import _num_samples, check_is_fitted try: - from sklearn.cluster._k_means import ( - k_means, _labels_inertia, _validate_center_shape) + from sklearn.cluster._k_means import _labels_inertia, _validate_center_shape, k_means except ModuleNotFoundError: - from sklearn.cluster._kmeans import ( - k_means, _labels_inertia, _validate_center_shape) + from sklearn.cluster._kmeans import k_means, _labels_inertia, _validate_center_shape -from sklearn.utils.extmath import row_norms import warnings from sklearn.cluster import KMeans as KMeans_original +from sklearn.utils.extmath import row_norms import daal4py -from .._utils import ( - getFPType, daal_check_version, PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, daal_check_version, getFPType def _tolerance(X, rtol): @@ -52,34 +49,38 @@ def _tolerance(X, rtol): def _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, random_state): - + X, X_fptype, nClusters, cluster_centers_0, random_state +): def is_string(s, target_str): return isinstance(s, str) and s == target_str deterministic = False - if is_string(cluster_centers_0, 'k-means++'): - _seed = random_state.randint(np.iinfo('i').max) + if is_string(cluster_centers_0, "k-means++"): + _seed = 
random_state.randint(np.iinfo("i").max) daal_engine = daal4py.engines_mt19937( - fptype=X_fptype, method='defaultDense', seed=_seed) + fptype=X_fptype, method="defaultDense", seed=_seed + ) _n_local_trials = 2 + int(np.log(nClusters)) - kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, - nTrials=_n_local_trials, - method='plusPlusDense', engine=daal_engine) + kmeans_init = daal4py.kmeans_init( + nClusters, + fptype=X_fptype, + nTrials=_n_local_trials, + method="plusPlusDense", + engine=daal_engine, + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif is_string(cluster_centers_0, 'random'): - _seed = random_state.randint(np.iinfo('i').max) + elif is_string(cluster_centers_0, "random"): + _seed = random_state.randint(np.iinfo("i").max) daal_engine = daal4py.engines_mt19937( - seed=_seed, fptype=X_fptype, method='defaultDense') + seed=_seed, fptype=X_fptype, method="defaultDense" + ) kmeans_init = daal4py.kmeans_init( - nClusters, - fptype=X_fptype, - method='randomDense', - engine=daal_engine) + nClusters, fptype=X_fptype, method="randomDense", engine=daal_engine + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif hasattr(cluster_centers_0, '__array__'): + elif hasattr(cluster_centers_0, "__array__"): deterministic = True cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) @@ -89,60 +90,75 @@ def is_string(s, target_str): cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) centroids_ = cc_arr - elif is_string(cluster_centers_0, 'deterministic'): + elif is_string(cluster_centers_0, "deterministic"): deterministic = True kmeans_init = daal4py.kmeans_init( - nClusters, fptype=X_fptype, method='defaultDense') + nClusters, fptype=X_fptype, method="defaultDense" + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids else: raise ValueError( "Cluster centers should either be 'k-means++'," - " 'random', 'deterministic' or an array") + " 'random', 'deterministic' or an array" + ) return deterministic, centroids_ -def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", - method="lloydDense", accuracyThreshold=0.0, - resultsToEvaluate="computeCentroids", gamma=1.0): +def _daal4py_kmeans_compatibility( + nClusters, + maxIterations, + fptype="double", + method="lloydDense", + accuracyThreshold=0.0, + resultsToEvaluate="computeCentroids", + gamma=1.0, +): kmeans_algo = None - if daal_check_version(((2020, 'P', 2), (2021, 'B', 107))): - kmeans_algo = daal4py.kmeans(nClusters=nClusters, - maxIterations=maxIterations, - fptype=fptype, - resultsToEvaluate=resultsToEvaluate, - accuracyThreshold=accuracyThreshold, - method=method, - gamma=gamma) + if daal_check_version(((2020, "P", 2), (2021, "B", 107))): + kmeans_algo = daal4py.kmeans( + nClusters=nClusters, + maxIterations=maxIterations, + fptype=fptype, + resultsToEvaluate=resultsToEvaluate, + accuracyThreshold=accuracyThreshold, + method=method, + gamma=gamma, + ) else: - assigFlag = 'computeAssignments' in resultsToEvaluate - kmeans_algo = daal4py.kmeans(nClusters=nClusters, - maxIterations=maxIterations, - fptype=fptype, - assignFlag=assigFlag, - accuracyThreshold=accuracyThreshold, - method=method, - gamma=gamma) + assigFlag = "computeAssignments" in resultsToEvaluate + kmeans_algo = daal4py.kmeans( + nClusters=nClusters, + maxIterations=maxIterations, + fptype=fptype, + assignFlag=assigFlag, + 
accuracyThreshold=accuracyThreshold, + method=method, + gamma=gamma, + ) return kmeans_algo -def _daal4py_k_means_predict(X, nClusters, centroids, - resultsToEvaluate='computeAssignments'): +def _daal4py_k_means_predict( + X, nClusters, centroids, resultsToEvaluate="computeAssignments" +): X_fptype = getFPType(X) kmeans_algo = _daal4py_kmeans_compatibility( nClusters=nClusters, maxIterations=0, fptype=X_fptype, resultsToEvaluate=resultsToEvaluate, - method='defaultDense') + method="defaultDense", + ) res = kmeans_algo.compute(X, centroids) return res.assignments[:, 0], res.objectiveFunction[0, 0] -def _daal4py_k_means_fit(X, nClusters, numIterations, - tol, cluster_centers_0, n_init, random_state): +def _daal4py_k_means_fit( + X, nClusters, numIterations, tol, cluster_centers_0, n_init, random_state +): if numIterations < 0: raise ValueError("Wrong iterations number") @@ -157,12 +173,14 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, maxIterations=numIterations, accuracyThreshold=abs_tol, fptype=X_fptype, - resultsToEvaluate='computeCentroids', - method='defaultDense') + resultsToEvaluate="computeCentroids", + method="defaultDense", + ) for k in range(n_init): deterministic, starting_centroids_ = _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, random_state) + X, X_fptype, nClusters, cluster_centers_0, random_state + ) res = kmeans_algo.compute(X, starting_centroids_) @@ -175,14 +193,17 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, best_n_iter = int(res.nIterations[0, 0]) if deterministic and n_init != 1: warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) break - flag_compute = 'computeAssignments|computeExactObjectiveFunction' + flag_compute = "computeAssignments|computeExactObjectiveFunction" best_labels, best_inertia = _daal4py_k_means_predict( - X, nClusters, best_cluster_centers, flag_compute) + X, nClusters, best_cluster_centers, flag_compute + ) return best_cluster_centers, best_labels, best_inertia, best_n_iter @@ -205,73 +226,103 @@ def _fit(self, X, y=None, sample_weight=None): """ if self.n_init <= 0: - raise ValueError("Invalid number of initializations." - " n_init=%d must be bigger than zero." % self.n_init) + raise ValueError( + "Invalid number of initializations." + " n_init=%d must be bigger than zero." 
% self.n_init + ) random_state = check_random_state(self.random_state) if self.max_iter <= 0: - raise ValueError('Number of iterations should be a positive number,' - ' got %d instead' % self.max_iter) + raise ValueError( + "Number of iterations should be a positive number," + " got %d instead" % self.max_iter + ) - if self.precompute_distances == 'auto': + if self.precompute_distances == "auto": precompute_distances = False elif isinstance(self.precompute_distances, bool): precompute_distances = self.precompute_distances else: - raise ValueError("precompute_distances should be 'auto' or True/False" - ", but a value of %r was passed" % - self.precompute_distances) + raise ValueError( + "precompute_distances should be 'auto' or True/False" + ", but a value of %r was passed" % self.precompute_distances + ) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.fit") - _dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (not precompute_distances, "The precomputing of distances is not supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit") + _dal_ready = _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (not precompute_distances, "The precomputing of distances is not supported."), + ] + ) if _dal_ready: X_len = _num_samples(X) - _dal_ready = _patching_status.and_conditions([ - (self.n_clusters <= X_len, - "The number of clusters is larger than the number of samples in X.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.n_clusters <= X_len, + "The number of clusters is larger than the number of samples in X.", + ) + ] + ) if _dal_ready and sample_weight is not None: sample_weight = np.asarray(sample_weight) - _dal_ready = _patching_status.and_conditions([ - (sample_weight.shape == (X_len,), - "Sample weights do not have the same length as X."), - (np.allclose(sample_weight, np.ones_like(sample_weight)), - "Sample weights are not ones.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + sample_weight.shape == (X_len,), + "Sample weights do not have the same length as X.", + ), + ( + np.allclose(sample_weight, np.ones_like(sample_weight)), + "Sample weights are not ones.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - k_means( - X, n_clusters=self.n_clusters, sample_weight=sample_weight, - init=self.init, n_init=self.n_init, max_iter=self.max_iter, - verbose=self.verbose, precompute_distances=precompute_distances, - tol=self.tol, random_state=random_state, copy_x=self.copy_x, - n_jobs=self.n_jobs, algorithm=self.algorithm, - return_n_iter=True) + self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = k_means( + X, + n_clusters=self.n_clusters, + sample_weight=sample_weight, + init=self.init, + n_init=self.n_init, + max_iter=self.max_iter, + verbose=self.verbose, + precompute_distances=precompute_distances, + tol=self.tol, + random_state=random_state, + copy_x=self.copy_x, + n_jobs=self.n_jobs, + algorithm=self.algorithm, + return_n_iter=True, + ) else: X = check_array( X, - accept_sparse='csr', dtype=[np.float64, np.float32], + accept_sparse="csr", + dtype=[np.float64, np.float32], order="C" if self.copy_x else None, - copy=self.copy_x + copy=self.copy_x, ) self.n_features_in_ = X.shape[1] - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - _daal4py_k_means_fit( - 
X, self.n_clusters, - self.max_iter, - self.tol, - self.init, - self.n_init, - random_state - ) + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.n_iter_, + ) = _daal4py_k_means_fit( + X, + self.n_clusters, + self.max_iter, + self.tol, + self.init, + self.n_init, + random_state, + ) return self @@ -300,35 +351,51 @@ def _predict(self, X, sample_weight=None): X = self._check_test_data(X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.predict") - _dal_ready = _patching_status.and_conditions([ - (sample_weight is None, "Sample weights are not supported."), - (hasattr(X, '__array__'), "X does not have '__array__' attribute.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.predict") + _dal_ready = _patching_status.and_conditions( + [ + (sample_weight is None, "Sample weights are not supported."), + (hasattr(X, "__array__"), "X does not have '__array__' attribute."), + ] + ) _patching_status.write_log() if _dal_ready: - return _daal4py_k_means_predict( - X, self.n_clusters, self.cluster_centers_)[0] + return _daal4py_k_means_predict(X, self.n_clusters, self.cluster_centers_)[0] x_squared_norms = row_norms(X, squared=True) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[0] + return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[0] class KMeans(KMeans_original): __doc__ = KMeans_original.__doc__ - def __init__(self, n_clusters=8, init='k-means++', n_init=10, - max_iter=300, tol=1e-4, precompute_distances='auto', - verbose=0, random_state=None, copy_x=True, - n_jobs=None, algorithm='auto'): - + def __init__( + self, + n_clusters=8, + init="k-means++", + n_init=10, + max_iter=300, + tol=1e-4, + precompute_distances="auto", + verbose=0, + random_state=None, + copy_x=True, + n_jobs=None, + algorithm="auto", + ): super(KMeans, self).__init__( - n_clusters=n_clusters, init=init, max_iter=max_iter, - tol=tol, precompute_distances=precompute_distances, - n_init=n_init, verbose=verbose, random_state=random_state, - copy_x=copy_x, n_jobs=n_jobs, algorithm=algorithm) + n_clusters=n_clusters, + init=init, + max_iter=max_iter, + tol=tol, + precompute_distances=precompute_distances, + n_init=n_init, + verbose=verbose, + random_state=random_state, + copy_x=copy_x, + n_jobs=n_jobs, + algorithm=algorithm, + ) @support_usm_ndarray() def fit(self, X, y=None, sample_weight=None): diff --git a/daal4py/sklearn/cluster/_k_means_0_23.py b/daal4py/sklearn/cluster/_k_means_0_23.py index 3b2497fcae..48b7fb790e 100755 --- a/daal4py/sklearn/cluster/_k_means_0_23.py +++ b/daal4py/sklearn/cluster/_k_means_0_23.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,38 +12,33 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers -from scipy import sparse as sp - -from sklearn.utils import check_random_state, check_array -from sklearn.utils.sparsefuncs import mean_variance_axis -from sklearn.utils.validation import ( - check_is_fitted, - _num_samples, - _deprecate_positional_args) +import warnings +import numpy as np +from scipy import sparse as sp +from sklearn.cluster import KMeans as KMeans_original from sklearn.cluster._kmeans import _labels_inertia -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - from sklearn.exceptions import ConvergenceWarning +from sklearn.utils import check_array, check_random_state +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.extmath import row_norms -import warnings - -from sklearn.cluster import KMeans as KMeans_original +from sklearn.utils.sparsefuncs import mean_variance_axis +from sklearn.utils.validation import ( + _deprecate_positional_args, + _num_samples, + check_is_fitted, +) import daal4py -from .._utils import ( - getFPType, - sklearn_check_version, - PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('1.1'): - from sklearn.utils.validation import ( - _check_sample_weight, _is_arraylike_not_scalar) +if sklearn_check_version("1.1"): + from sklearn.utils.validation import _check_sample_weight, _is_arraylike_not_scalar def _validate_center_shape(X, n_centers, centers): @@ -51,11 +46,13 @@ def _validate_center_shape(X, n_centers, centers): if centers.shape[0] != n_centers: raise ValueError( f"The shape of the initial centers {centers.shape} does not " - f"match the number of clusters {n_centers}.") + f"match the number of clusters {n_centers}." + ) if centers.shape[1] != X.shape[1]: raise ValueError( f"The shape of the initial centers {centers.shape} does not " - f"match the number of features of the data {X.shape[1]}.") + f"match the number of features of the data {X.shape[1]}." 
+ ) def _tolerance(X, rtol): @@ -71,23 +68,20 @@ def _tolerance(X, rtol): def _daal4py_compute_starting_centroids( - X, - X_fptype, - nClusters, - cluster_centers_0, - verbose, - random_state + X, X_fptype, nClusters, cluster_centers_0, verbose, random_state ): def is_string(s, target_str): return isinstance(s, str) and s == target_str + is_sparse = sp.isspmatrix(X) deterministic = False - if is_string(cluster_centers_0, 'k-means++'): - _seed = random_state.randint(np.iinfo('i').max) + if is_string(cluster_centers_0, "k-means++"): + _seed = random_state.randint(np.iinfo("i").max) plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense" daal_engine = daal4py.engines_mt19937( - fptype=X_fptype, method="defaultDense", seed=_seed) + fptype=X_fptype, method="defaultDense", seed=_seed + ) _n_local_trials = 2 + int(np.log(nClusters)) kmeans_init = daal4py.kmeans_init( nClusters, @@ -98,11 +92,12 @@ def is_string(s, target_str): ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif is_string(cluster_centers_0, 'random'): - _seed = random_state.randint(np.iinfo('i').max) + elif is_string(cluster_centers_0, "random"): + _seed = random_state.randint(np.iinfo("i").max) random_method = "randomCSR" if is_sparse else "randomDense" daal_engine = daal4py.engines_mt19937( - seed=_seed, fptype=X_fptype, method="defaultDense") + seed=_seed, fptype=X_fptype, method="defaultDense" + ) kmeans_init = daal4py.kmeans_init( nClusters, fptype=X_fptype, @@ -111,7 +106,7 @@ def is_string(s, target_str): ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids - elif hasattr(cluster_centers_0, '__array__'): + elif hasattr(cluster_centers_0, "__array__"): deterministic = True cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) @@ -121,25 +116,33 @@ def is_string(s, target_str): cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype) _validate_center_shape(X, nClusters, cc_arr) centroids_ = cc_arr - elif is_string(cluster_centers_0, 'deterministic'): + elif is_string(cluster_centers_0, "deterministic"): deterministic = True default_method = "lloydCSR" if is_sparse else "defaultDense" kmeans_init = daal4py.kmeans_init( - nClusters, fptype=X_fptype, method=default_method) + nClusters, fptype=X_fptype, method=default_method + ) kmeans_init_res = kmeans_init.compute(X) centroids_ = kmeans_init_res.centroids else: raise ValueError( f"init should be either 'k-means++', 'random', a ndarray or a " - f"callable, got '{cluster_centers_0}' instead.") + f"callable, got '{cluster_centers_0}' instead." 
+ ) if verbose: print("Initialization complete") return deterministic, centroids_ -def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", - method="lloydDense", accuracyThreshold=0.0, - resultsToEvaluate="computeCentroids", gamma=1.0): +def _daal4py_kmeans_compatibility( + nClusters, + maxIterations, + fptype="double", + method="lloydDense", + accuracyThreshold=0.0, + resultsToEvaluate="computeCentroids", + gamma=1.0, +): kmeans_algo = daal4py.kmeans( nClusters=nClusters, maxIterations=maxIterations, @@ -147,13 +150,14 @@ def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double", resultsToEvaluate=resultsToEvaluate, accuracyThreshold=accuracyThreshold, method=method, - gamma=gamma + gamma=gamma, ) return kmeans_algo -def _daal4py_k_means_predict(X, nClusters, centroids, - resultsToEvaluate='computeAssignments'): +def _daal4py_k_means_predict( + X, nClusters, centroids, resultsToEvaluate="computeAssignments" +): X_fptype = getFPType(X) is_sparse = sp.isspmatrix(X) method = "lloydCSR" if is_sparse else "defaultDense" @@ -170,8 +174,9 @@ def _daal4py_k_means_predict(X, nClusters, centroids, return res.assignments[:, 0], res.objectiveFunction[0, 0] -def _daal4py_k_means_fit(X, nClusters, numIterations, - tol, cluster_centers_0, n_init, verbose, random_state): +def _daal4py_k_means_fit( + X, nClusters, numIterations, tol, cluster_centers_0, n_init, verbose, random_state +): if numIterations < 0: raise ValueError("Wrong iterations number") @@ -179,15 +184,15 @@ def is_string(s, target_str): return isinstance(s, str) and s == target_str default_n_init = 10 - if n_init in ['auto', 'warn']: - if n_init == "warn" and sklearn_check_version('1.2'): + if n_init in ["auto", "warn"]: + if n_init == "warn" and sklearn_check_version("1.2"): warnings.warn( "The default value of `n_init` will change from " f"{default_n_init} to 'auto' in 1.4. 
Set the value of `n_init`" " explicitly to suppress the warning", FutureWarning, ) - if is_string(cluster_centers_0, 'k-means++'): + if is_string(cluster_centers_0, "k-means++"): n_init = 1 else: n_init = default_n_init @@ -202,13 +207,14 @@ def is_string(s, target_str): maxIterations=numIterations, accuracyThreshold=abs_tol, fptype=X_fptype, - resultsToEvaluate='computeCentroids', + resultsToEvaluate="computeCentroids", method=method, ) for k in range(n_init): deterministic, starting_centroids_ = _daal4py_compute_starting_centroids( - X, X_fptype, nClusters, cluster_centers_0, verbose, random_state) + X, X_fptype, nClusters, cluster_centers_0, verbose, random_state + ) res = kmeans_algo.compute(X, starting_centroids_) @@ -224,14 +230,17 @@ def is_string(s, target_str): best_n_iter = int(res.nIterations[0, 0]) if deterministic and n_init != 1: warnings.warn( - 'Explicit initial center position passed: ' - 'performing only one init in k-means instead of n_init=%d' - % n_init, RuntimeWarning, stacklevel=2) + "Explicit initial center position passed: " + "performing only one init in k-means instead of n_init=%d" % n_init, + RuntimeWarning, + stacklevel=2, + ) break - flag_compute = 'computeAssignments|computeExactObjectiveFunction' + flag_compute = "computeAssignments|computeExactObjectiveFunction" best_labels, best_inertia = _daal4py_k_means_predict( - X, nClusters, best_cluster_centers, flag_compute) + X, nClusters, best_cluster_centers, flag_compute + ) distinct_clusters = np.unique(best_labels).size if distinct_clusters < nClusters: @@ -239,7 +248,9 @@ def is_string(s, target_str): "Number of distinct clusters ({}) found smaller than " "n_clusters ({}). Possibly due to duplicate points " "in X.".format(distinct_clusters, nClusters), - ConvergenceWarning, stacklevel=2) + ConvergenceWarning, + stacklevel=2, + ) # for passing test case "test_kmeans_warns_less_centers_than_unique_points" return best_cluster_centers, best_labels, best_inertia, best_n_iter @@ -264,8 +275,8 @@ def _fit(self, X, y=None, sample_weight=None): """ init = self.init - if sklearn_check_version('1.1'): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.1"): + if sklearn_check_version("1.2"): self._validate_params() X = self._validate_data( @@ -277,7 +288,7 @@ def _fit(self, X, y=None, sample_weight=None): accept_large_sparse=False, ) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._check_params_vs_input(X) else: self._check_params(X) @@ -292,99 +303,141 @@ def _fit(self, X, y=None, sample_weight=None): init = check_array(init, dtype=X.dtype, copy=True, order="C") self._validate_center_shape(X, init) else: - if hasattr(self, 'precompute_distances'): - if self.precompute_distances != 'deprecated': - if sklearn_check_version('0.24'): - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 1.0 (renaming of 0.25)." - " It has no effect", FutureWarning) - elif sklearn_check_version('0.23'): - warnings.warn("'precompute_distances' was deprecated in version " - "0.23 and will be removed in 0.25. It has no " - "effect", FutureWarning) + if hasattr(self, "precompute_distances"): + if self.precompute_distances != "deprecated": + if sklearn_check_version("0.24"): + warnings.warn( + "'precompute_distances' was deprecated in version " + "0.23 and will be removed in 1.0 (renaming of 0.25)." 
+ " It has no effect", + FutureWarning, + ) + elif sklearn_check_version("0.23"): + warnings.warn( + "'precompute_distances' was deprecated in version " + "0.23 and will be removed in 0.25. It has no " + "effect", + FutureWarning, + ) self._n_threads = None - if hasattr(self, 'n_jobs'): - if self.n_jobs != 'deprecated': - if sklearn_check_version('0.24'): - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 1.0 (renaming of 0.25).", FutureWarning) - elif sklearn_check_version('0.23'): - warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" - " removed in 0.25.", FutureWarning) + if hasattr(self, "n_jobs"): + if self.n_jobs != "deprecated": + if sklearn_check_version("0.24"): + warnings.warn( + "'n_jobs' was deprecated in version 0.23 and will be" + " removed in 1.0 (renaming of 0.25).", + FutureWarning, + ) + elif sklearn_check_version("0.23"): + warnings.warn( + "'n_jobs' was deprecated in version 0.23 and will be" + " removed in 0.25.", + FutureWarning, + ) self._n_threads = self.n_jobs self._n_threads = _openmp_effective_n_threads(self._n_threads) if self.n_init <= 0: - raise ValueError( - f"n_init should be > 0, got {self.n_init} instead.") + raise ValueError(f"n_init should be > 0, got {self.n_init} instead.") random_state = check_random_state(self.random_state) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if self.max_iter <= 0: - raise ValueError( - f"max_iter should be > 0, got {self.max_iter} instead.") + raise ValueError(f"max_iter should be > 0, got {self.max_iter} instead.") algorithm = self.algorithm - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. Using 'full' instead.", + RuntimeWarning, + ) algorithm = "lloyd" if algorithm == "auto" or algorithm == "full": - warnings.warn("algorithm= {'auto','full'} is deprecated" - "Using 'lloyd' instead.", RuntimeWarning) + warnings.warn( + "algorithm= {'auto','full'} is deprecated" "Using 'lloyd' instead.", + RuntimeWarning, + ) algorithm = "lloyd" if self.n_clusters == 1 else "elkan" if algorithm not in ["lloyd", "full", "elkan"]: - raise ValueError("Algorithm must be 'auto','lloyd', 'full' or 'elkan'," - "got {}".format(str(algorithm))) + raise ValueError( + "Algorithm must be 'auto','lloyd', 'full' or 'elkan'," + "got {}".format(str(algorithm)) + ) else: if algorithm == "elkan" and self.n_clusters == 1: - warnings.warn("algorithm='elkan' doesn't make sense for a single " - "cluster. Using 'full' instead.", RuntimeWarning) + warnings.warn( + "algorithm='elkan' doesn't make sense for a single " + "cluster. 
Using 'full' instead.", + RuntimeWarning, + ) algorithm = "full" if algorithm == "auto": algorithm = "full" if self.n_clusters == 1 else "elkan" if algorithm not in ["full", "elkan"]: - raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got" - " {}".format(str(algorithm))) + raise ValueError( + "Algorithm must be 'auto', 'full' or 'elkan', got" + " {}".format(str(algorithm)) + ) X_len = _num_samples(X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.fit") - _dal_ready = _patching_status.and_conditions([ - (self.n_clusters <= X_len, - "The number of clusters is larger than the number of samples in X.") - ]) + _patching_status = PatchingConditionsChain("sklearn.cluster.KMeans.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.n_clusters <= X_len, + "The number of clusters is larger than the number of samples in X.", + ) + ] + ) if _dal_ready and sample_weight is not None: if isinstance(sample_weight, numbers.Number): sample_weight = np.full(X_len, sample_weight, dtype=np.float64) else: sample_weight = np.asarray(sample_weight) - _dal_ready = _patching_status.and_conditions([ - (sample_weight.shape == (X_len,), - "Sample weights do not have the same length as X."), - (np.allclose(sample_weight, np.ones_like(sample_weight)), - "Sample weights are not ones.") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + sample_weight.shape == (X_len,), + "Sample weights do not have the same length as X.", + ), + ( + np.allclose(sample_weight, np.ones_like(sample_weight)), + "Sample weights are not ones.", + ), + ] + ) _patching_status.write_log() if _dal_ready: - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) self.n_features_in_ = X.shape[1] - self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ - _daal4py_k_means_fit( - X, self.n_clusters, self.max_iter, self.tol, init, self.n_init, - self.verbose, random_state) - if sklearn_check_version('1.1'): + ( + self.cluster_centers_, + self.labels_, + self.inertia_, + self.n_iter_, + ) = _daal4py_k_means_fit( + X, + self.n_clusters, + self.max_iter, + self.tol, + init, + self.n_init, + self.verbose, + random_state, + ) + if sklearn_check_version("1.1"): self._n_features_out = self.cluster_centers_.shape[0] else: super(KMeans, self).fit(X, y=y, sample_weight=sample_weight) @@ -395,15 +448,15 @@ def _daal4py_check_test_data(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse='csr', - dtype=[np.float64, np.float32], - accept_large_sparse=False + X, accept_sparse="csr", dtype=[np.float64, np.float32], accept_large_sparse=False ) if self.n_features_in_ != X.shape[1]: raise ValueError( - (f'X has {X.shape[1]} features, ' - f'but Kmeans is expecting {self.n_features_in_} features as input')) + ( + f"X has {X.shape[1]} features, " + f"but Kmeans is expecting {self.n_features_in_} features as input" + ) + ) return X @@ -432,22 +485,22 @@ def _predict(self, X, sample_weight=None): X = _daal4py_check_test_data(self, X) - _patching_status = PatchingConditionsChain( - "sklearn.cluster.KMeans.predict") - _patching_status.and_conditions([ - (sample_weight is None, "Sample weights are not supported."), - (hasattr(X, '__array__'), "X does not have '__array__' attribute.") - ]) - _dal_ready = _patching_status.or_conditions([ - (sp.isspmatrix_csr(X), "X is not sparse.") - ]) + _patching_status = 
PatchingConditionsChain("sklearn.cluster.KMeans.predict") + _patching_status.and_conditions( + [ + (sample_weight is None, "Sample weights are not supported."), + (hasattr(X, "__array__"), "X does not have '__array__' attribute."), + ] + ) + _dal_ready = _patching_status.or_conditions( + [(sp.isspmatrix_csr(X), "X is not sparse.")] + ) _patching_status.write_log() if _dal_ready: - return _daal4py_k_means_predict( - X, self.n_clusters, self.cluster_centers_)[0] - if sklearn_check_version('1.2'): - if sklearn_check_version('1.3') and sample_weight is not None: + return _daal4py_k_means_predict(X, self.n_clusters, self.cluster_centers_)[0] + if sklearn_check_version("1.2"): + if sklearn_check_version("1.3") and sample_weight is not None: warnings.warn( "'sample_weight' was deprecated in version 1.3 and " "will be removed in 1.5.", @@ -456,30 +509,30 @@ def _predict(self, X, sample_weight=None): return _labels_inertia(X, sample_weight, self.cluster_centers_)[0] else: x_squared_norms = row_norms(X, squared=True) - return _labels_inertia(X, sample_weight, x_squared_norms, - self.cluster_centers_)[0] + return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[ + 0 + ] class KMeans(KMeans_original): __doc__ = KMeans_original.__doc__ - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KMeans_original._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KMeans_original._parameter_constraints} @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', - n_init='auto' if sklearn_check_version('1.4') else 'warn', + init="k-means++", + n_init="auto" if sklearn_check_version("1.4") else "warn", max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='lloyd', + algorithm="lloyd", ): super(KMeans, self).__init__( n_clusters=n_clusters, @@ -492,20 +545,22 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='auto', + algorithm="auto", ): super(KMeans, self).__init__( n_clusters=n_clusters, @@ -518,22 +573,24 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) + else: + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, - precompute_distances='deprecated', + precompute_distances="deprecated", verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', - algorithm='auto', + n_jobs="deprecated", + algorithm="auto", ): super(KMeans, self).__init__( n_clusters=n_clusters, diff --git a/daal4py/sklearn/cluster/dbscan.py b/daal4py/sklearn/cluster/dbscan.py index 274be166d3..e868a91d63 100644 --- a/daal4py/sklearn/cluster/dbscan.py +++ b/daal4py/sklearn/cluster/dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from ._dbscan import * diff --git a/daal4py/sklearn/cluster/k_means.py b/daal4py/sklearn/cluster/k_means.py index caa4a69abe..e57c309d7d 100755 --- a/daal4py/sklearn/cluster/k_means.py +++ b/daal4py/sklearn/cluster/k_means.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import sklearn_check_version -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from ._k_means_0_23 import * else: from ._k_means_0_22 import * diff --git a/daal4py/sklearn/cluster/tests/test_dbscan.py b/daal4py/sklearn/cluster/tests/test_dbscan.py index 0b2f88f254..1640e0e746 100755 --- a/daal4py/sklearn/cluster/tests/test_dbscan.py +++ b/daal4py/sklearn/cluster/tests/test_dbscan.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,27 +12,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np import pytest from sklearn.cluster import DBSCAN as DBSCAN_SKLEARN + from daal4py.sklearn.cluster import DBSCAN as DBSCAN_DAAL -METRIC = ('euclidean', ) +METRIC = ("euclidean",) USE_WEIGHTS = (True, False) -def generate_data(low: int, high: int, samples_number: int, - sample_dimension: tuple) -> tuple: +def generate_data( + low: int, high: int, samples_number: int, sample_dimension: tuple +) -> tuple: generator = np.random.RandomState() table_size = (samples_number, sample_dimension) - return generator.uniform( - low=low, high=high, size=table_size), generator.uniform(size=samples_number) + return generator.uniform(low=low, high=high, size=table_size), generator.uniform( + size=samples_number + ) -def check_labels_equals(left_labels: np.ndarray, - right_labels: np.ndarray) -> bool: +def check_labels_equals(left_labels: np.ndarray, right_labels: np.ndarray) -> bool: if left_labels.shape != right_labels.shape: raise Exception("Shapes not equals") if len(left_labels.shape) != 1: @@ -42,42 +44,49 @@ def check_labels_equals(left_labels: np.ndarray, dict_checker = {} for index_sample in range(left_labels.shape[0]): if left_labels[index_sample] not in dict_checker: - dict_checker[left_labels[index_sample] - ] = right_labels[index_sample] + dict_checker[left_labels[index_sample]] = right_labels[index_sample] elif dict_checker[left_labels[index_sample]] != right_labels[index_sample]: raise Exception("Wrong clustering") return True -def _test_dbscan_big_data_numpy_gen(eps: float, min_samples: int, metric: str, - use_weights: bool, low=-100.0, high=100.0, - samples_number=1000, sample_dimension=4): +def _test_dbscan_big_data_numpy_gen( + eps: float, + min_samples: int, + metric: str, + use_weights: bool, + low=-100.0, + high=100.0, + samples_number=1000, + sample_dimension=4, +): data, weights = generate_data( - low=low, high=high, samples_number=samples_number, - sample_dimension=sample_dimension) + low=low, + high=high, + samples_number=samples_number, + sample_dimension=sample_dimension, + ) if use_weights is False: weights = None initialized_daal_dbscan = DBSCAN_DAAL( - eps=eps, min_samples=min_samples, metric=metric).fit( - X=data, sample_weight=weights) + eps=eps, min_samples=min_samples, metric=metric + ).fit(X=data, sample_weight=weights) initialized_sklearn_dbscan = DBSCAN_SKLEARN( - metric=metric, eps=eps, min_samples=min_samples).fit( - X=data, sample_weight=weights) + metric=metric, eps=eps, min_samples=min_samples + ).fit(X=data, sample_weight=weights) check_labels_equals( - initialized_daal_dbscan.labels_, - initialized_sklearn_dbscan.labels_) + initialized_daal_dbscan.labels_, initialized_sklearn_dbscan.labels_ + ) -@pytest.mark.parametrize('metric', METRIC) -@pytest.mark.parametrize('use_weights', USE_WEIGHTS) +@pytest.mark.parametrize("metric", METRIC) +@pytest.mark.parametrize("use_weights", USE_WEIGHTS) def test_dbscan_big_data_numpy_gen(metric, use_weights: bool): eps = 35.0 min_samples = 6 _test_dbscan_big_data_numpy_gen( - eps=eps, - min_samples=min_samples, - metric=metric, - use_weights=use_weights) + eps=eps, min_samples=min_samples, metric=metric, use_weights=use_weights + ) def _test_across_grid_parameter_numpy_gen(metric, use_weights: bool): @@ -88,17 +97,13 @@ def _test_across_grid_parameter_numpy_gen(metric, use_weights: bool): min_samples_end = 15 min_samples_step = 1 for eps in 
np.arange(eps_begin, eps_end, eps_step): - for min_samples in range( - min_samples_begin, min_samples_end, min_samples_step): + for min_samples in range(min_samples_begin, min_samples_end, min_samples_step): _test_dbscan_big_data_numpy_gen( - eps=eps, - min_samples=min_samples, - metric=metric, - use_weights=use_weights) + eps=eps, min_samples=min_samples, metric=metric, use_weights=use_weights + ) -@pytest.mark.parametrize('metric', METRIC) -@pytest.mark.parametrize('use_weights', USE_WEIGHTS) +@pytest.mark.parametrize("metric", METRIC) +@pytest.mark.parametrize("use_weights", USE_WEIGHTS) def test_across_grid_parameter_numpy_gen(metric, use_weights: bool): - _test_across_grid_parameter_numpy_gen( - metric=metric, use_weights=use_weights) + _test_across_grid_parameter_numpy_gen(metric=metric, use_weights=use_weights) diff --git a/daal4py/sklearn/decomposition/__init__.py b/daal4py/sklearn/decomposition/__init__.py index 404a5cff95..a58befffe2 100644 --- a/daal4py/sklearn/decomposition/__init__.py +++ b/daal4py/sklearn/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/daal4py/sklearn/decomposition/_pca.py b/daal4py/sklearn/decomposition/_pca.py index e04f23d664..06792795f5 100644 --- a/daal4py/sklearn/decomposition/_pca.py +++ b/daal4py/sklearn/decomposition/_pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,30 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers from math import sqrt -from scipy.sparse import issparse +import numpy as np +from scipy.sparse import issparse from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_is_fitted import daal4py -from .._utils import ( - getFPType, sklearn_check_version, PatchingConditionsChain) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('0.22'): +if sklearn_check_version("0.22"): from sklearn.decomposition._pca import PCA as PCA_original else: from sklearn.decomposition.pca import PCA as PCA_original -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from sklearn.decomposition._pca import _infer_dimension -elif sklearn_check_version('0.22'): +elif sklearn_check_version("0.22"): from sklearn.decomposition._pca import _infer_dimension_ else: from sklearn.decomposition.pca import _infer_dimension_ @@ -49,12 +49,12 @@ def __init__( n_components=None, copy=True, whiten=False, - svd_solver='auto', + svd_solver="auto", tol=0.0, - iterated_power='auto', + iterated_power="auto", n_oversamples=10, power_iteration_normalizer="auto", - random_state=None + random_state=None, ): self.n_components = n_components self.copy = copy @@ -67,27 +67,30 @@ def __init__( self.random_state = random_state def _validate_n_components(self, n_components, n_samples, n_features): - if n_components == 'mle': + if n_components == "mle": if n_samples < n_features: - raise ValueError("n_components='mle' is only supported " - "if n_samples >= n_features") + raise ValueError( + "n_components='mle' is only supported " "if n_samples >= n_features" + ) elif not 0 <= n_components <= min(n_samples, n_features): - raise ValueError("n_components=%r must be between 0 and " - "min(n_samples, n_features)=%r with " - "svd_solver='full'" - % (n_components, min(n_samples, n_features))) + raise ValueError( + "n_components=%r must be between 0 and " + "min(n_samples, n_features)=%r with " + "svd_solver='full'" % (n_components, min(n_samples, n_features)) + ) elif n_components >= 1: if not isinstance(n_components, numbers.Integral): - raise ValueError("n_components=%r must be of type int " - "when greater than or equal to 1, " - "was of type=%r" - % (n_components, type(n_components))) + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) def _fit_full_daal4py(self, X, n_components): n_samples, n_features = X.shape n_sf_min = min(n_samples, n_features) - if n_components == 'mle': + if n_components == "mle": daal_n_components = n_features elif n_components < 1: daal_n_components = n_sf_min @@ -97,7 +100,8 @@ def _fit_full_daal4py(self, X, n_components): fpType = getFPType(X) covariance_algo = daal4py.covariance( - fptype=fpType, outputMatrixType='covarianceMatrix') + fptype=fpType, outputMatrixType="covarianceMatrix" + ) covariance_res = covariance_algo.compute(X) self.mean_ = covariance_res.mean.ravel() @@ -106,10 +110,10 @@ def _fit_full_daal4py(self, X, n_components): pca_alg = daal4py.pca( fptype=fpType, - method='correlationDense', - resultsToCompute='eigenvalue', + method="correlationDense", + 
resultsToCompute="eigenvalue", isDeterministic=True, - nComponents=daal_n_components + nComponents=daal_n_components, ) pca_res = pca_alg.compute(X, covariance) @@ -118,16 +122,16 @@ def _fit_full_daal4py(self, X, n_components): tot_var = explained_variance_.sum() explained_variance_ratio_ = explained_variance_ / tot_var - if n_components == 'mle': - if sklearn_check_version('0.23'): + if n_components == "mle": + if sklearn_check_version("0.23"): n_components = _infer_dimension(explained_variance_, n_samples) else: - n_components = \ - _infer_dimension_(explained_variance_, n_samples, n_features) + n_components = _infer_dimension_( + explained_variance_, n_samples, n_features + ) elif 0 < n_components < 1.0: ratio_cumsum = stable_cumsum(explained_variance_ratio_) - n_components = np.searchsorted(ratio_cumsum, n_components, - side='right') + 1 + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 if n_components < n_sf_min: if explained_variance_.shape[0] == n_sf_min: @@ -137,9 +141,9 @@ def _fit_full_daal4py(self, X, n_components): resid_var_ -= explained_variance_[:n_components].sum() self.noise_variance_ = resid_var_ / (n_sf_min - n_components) else: - self.noise_variance_ = 0. + self.noise_variance_ = 0.0 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.n_samples_, self.n_features_in_ = n_samples, n_features else: self.n_samples_, self.n_features_ = n_samples, n_features @@ -159,23 +163,23 @@ def _fit_full(self, X, n_components): V = self.components_ S = self.singular_values_ - if n_components == 'mle': - if sklearn_check_version('0.23'): + if n_components == "mle": + if sklearn_check_version("0.23"): n_components = _infer_dimension(self.explained_variance_, n_samples) else: - n_components = \ - _infer_dimension_(self.explained_variance_, n_samples, n_features) + n_components = _infer_dimension_( + self.explained_variance_, n_samples, n_features + ) elif 0 < n_components < 1.0: ratio_cumsum = stable_cumsum(self.explained_variance_ratio_) - n_components = np.searchsorted(ratio_cumsum, n_components, - side='right') + 1 + n_components = np.searchsorted(ratio_cumsum, n_components, side="right") + 1 if n_components < min(n_features, n_samples): self.noise_variance_ = self.explained_variance_[n_components:].mean() else: - self.noise_variance_ = 0. + self.noise_variance_ = 0.0 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.n_samples_, self.n_features_in_ = n_samples, n_features else: self.n_samples_, self.n_features_ = n_samples, n_features @@ -189,17 +193,20 @@ def _fit_full(self, X, n_components): def _fit(self, X): if issparse(X): - raise TypeError('PCA does not support sparse input. See ' - 'TruncatedSVD for a possible alternative.') - - if sklearn_check_version('0.23'): - X = self._validate_data(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + raise TypeError( + "PCA does not support sparse input. See " + "TruncatedSVD for a possible alternative." 
+ ) + + if sklearn_check_version("0.23"): + X = self._validate_data( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) else: X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False) if self.n_components is None: - if self.svd_solver != 'arpack': + if self.svd_solver != "arpack": n_components = min(X.shape) else: n_components = min(X.shape) - 1 @@ -209,8 +216,8 @@ def _fit(self, X): self._fit_svd_solver = self.svd_solver shape_good_for_daal = X.shape[1] / X.shape[0] < 2 - if self._fit_svd_solver == 'auto': - if sklearn_check_version('1.1'): + if self._fit_svd_solver == "auto": + if sklearn_check_version("1.1"): # Small problem or n_components == 'mle', just call full PCA if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" @@ -220,8 +227,8 @@ def _fit(self, X): else: self._fit_svd_solver = "full" else: - if n_components == 'mle': - self._fit_svd_solver = 'full' + if n_components == "mle": + self._fit_svd_solver = "full" else: n, p, k = X.shape[0], X.shape[1], n_components # These coefficients are result of training of Logistic Regression @@ -230,56 +237,68 @@ def _fit(self, X): # X is a dataset with npk, np^2, and n^2 columns. # And y is speedup of patched scikit-learn's # full PCA against stock scikit-learn's randomized PCA. - regression_coefs = np.array([ - [9.779873e-11, n * p * k], - [-1.122062e-11, n * p * p], - [1.127905e-09, n ** 2], - ]) - - if n_components >= 1 and np.dot( - regression_coefs[:, 0], regression_coefs[:, 1]) <= 0: - self._fit_svd_solver = 'randomized' + regression_coefs = np.array( + [ + [9.779873e-11, n * p * k], + [-1.122062e-11, n * p * p], + [1.127905e-09, n**2], + ] + ) + + if ( + n_components >= 1 + and np.dot(regression_coefs[:, 0], regression_coefs[:, 1]) <= 0 + ): + self._fit_svd_solver = "randomized" else: - self._fit_svd_solver = 'full' + self._fit_svd_solver = "full" - if not shape_good_for_daal or self._fit_svd_solver != 'full': - if sklearn_check_version('0.23'): + if not shape_good_for_daal or self._fit_svd_solver != "full": + if sklearn_check_version("0.23"): X = self._validate_data(X, copy=self.copy) else: X = check_array(X, copy=self.copy) - _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.fit") - _dal_ready = _patching_status.and_conditions([ - (self._fit_svd_solver == 'full', - f"'{self._fit_svd_solver}' SVD solver is not supported. " - "Only 'full' solver is supported.") - ]) + _patching_status = PatchingConditionsChain("sklearn.decomposition.PCA.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self._fit_svd_solver == "full", + f"'{self._fit_svd_solver}' SVD solver is not supported. 
" + "Only 'full' solver is supported.", + ) + ] + ) if _dal_ready: - _dal_ready = _patching_status.and_conditions([ - (shape_good_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features / number of samples >= 2") - ]) + _dal_ready = _patching_status.and_conditions( + [ + ( + shape_good_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features / number of samples >= 2", + ) + ] + ) if _dal_ready: result = self._fit_full(X, n_components) else: result = PCA_original._fit_full(self, X, n_components) - elif self._fit_svd_solver in ['arpack', 'randomized']: + elif self._fit_svd_solver in ["arpack", "randomized"]: result = self._fit_truncated(X, n_components, self._fit_svd_solver) else: - raise ValueError("Unrecognized svd_solver='{0}'" - "".format(self._fit_svd_solver)) + raise ValueError( + "Unrecognized svd_solver='{0}'" "".format(self._fit_svd_solver) + ) _patching_status.write_log() return result def _transform_daal4py(self, X, whiten=False, scale_eigenvalues=True, check_X=True): - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, ['mean_', 'components_'], all_or_any=all) + check_is_fitted(self, ["mean_", "components_"], all_or_any=all) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -288,30 +307,36 @@ def _transform_daal4py(self, X, whiten=False, scale_eigenvalues=True, check_X=Tr tr_data = dict() if self.mean_ is not None: - tr_data['mean'] = self.mean_.reshape((1, -1)) + tr_data["mean"] = self.mean_.reshape((1, -1)) if whiten: if scale_eigenvalues: - tr_data['eigenvalue'] = \ - (self.n_samples_ - 1) * self.explained_variance_.reshape((1, -1)) + tr_data["eigenvalue"] = ( + self.n_samples_ - 1 + ) * self.explained_variance_.reshape((1, -1)) else: - tr_data['eigenvalue'] = self.explained_variance_.reshape((1, -1)) + tr_data["eigenvalue"] = self.explained_variance_.reshape((1, -1)) elif scale_eigenvalues: - tr_data['eigenvalue'] = np.full( + tr_data["eigenvalue"] = np.full( (1, self.explained_variance_.shape[0]), - self.n_samples_ - 1.0, dtype=X.dtype) + self.n_samples_ - 1.0, + dtype=X.dtype, + ) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): expected_n_features = self.n_features_in_ else: expected_n_features = self.n_features_ if X.shape[1] != expected_n_features: raise ValueError( - (f'X has {X.shape[1]} features, ' - f'but PCA is expecting {expected_n_features} features as input')) - - tr_res = daal4py.pca_transform( - fptype=fpType - ).compute(X, self.components_, tr_data) + ( + f"X has {X.shape[1]} features, " + f"but PCA is expecting {expected_n_features} features as input" + ) + ) + + tr_res = daal4py.pca_transform(fptype=fpType).compute( + X, self.components_, tr_data + ) return tr_res.transformedData @@ -335,16 +360,16 @@ def transform(self, X): Projection of X in the first principal components, where `n_samples` is the number of samples and `n_components` is the number of the components. 
""" - _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.transform") - _dal_ready = _patching_status.and_conditions([ - (self.n_components_ > 0, "Number of components <= 0.") - ]) + _patching_status = PatchingConditionsChain("sklearn.decomposition.PCA.transform") + _dal_ready = _patching_status.and_conditions( + [(self.n_components_ > 0, "Number of components <= 0.")] + ) _patching_status.write_log() if _dal_ready: - return self._transform_daal4py(X, whiten=self.whiten, - check_X=True, scale_eigenvalues=False) + return self._transform_daal4py( + X, whiten=self.whiten, check_X=True, scale_eigenvalues=False + ) return PCA_original.transform(self, X) @support_usm_ndarray() @@ -372,32 +397,34 @@ def fit_transform(self, X, y=None): C-ordered array, use 'np.ascontiguousarray'. """ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._validate_params() U, S, Vt = self._fit(X) _patching_status = PatchingConditionsChain( - "sklearn.decomposition.PCA.fit_transform") - _dal_ready = _patching_status.and_conditions([ - (U is None, "Stock fitting was used.") - ]) + "sklearn.decomposition.PCA.fit_transform" + ) + _dal_ready = _patching_status.and_conditions( + [(U is None, "Stock fitting was used.")] + ) if _dal_ready: - _dal_ready = _patching_status.and_conditions([ - (self.n_components_ > 0, "Number of components <= 0.") - ]) + _dal_ready = _patching_status.and_conditions( + [(self.n_components_ > 0, "Number of components <= 0.")] + ) if _dal_ready: result = self._transform_daal4py( - X, whiten=self.whiten, check_X=False, scale_eigenvalues=False) + X, whiten=self.whiten, check_X=False, scale_eigenvalues=False + ) else: result = np.empty((self.n_samples_, 0), dtype=X.dtype) else: - U = U[:, :self.n_components_] + U = U[:, : self.n_components_] if self.whiten: U *= sqrt(X.shape[0] - 1) else: - U *= S[:self.n_components_] + U *= S[: self.n_components_] result = U diff --git a/daal4py/sklearn/ensemble/AdaBoostClassifier.py b/daal4py/sklearn/ensemble/AdaBoostClassifier.py index 320c15dc3e..6871ba26ee 100644 --- a/daal4py/sklearn/ensemble/AdaBoostClassifier.py +++ b/daal4py/sklearn/ensemble/AdaBoostClassifier.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,20 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py AdaBoost (Adaptive Boosting) scikit-learn-compatible estimator class -import numpy as np import numbers -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + +import numpy as np +from sklearn import __version__ as sklearn_version from sklearn import preprocessing +from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + import daal4py as d4p + from .._utils import getFPType -from sklearn import __version__ as sklearn_version try: from packaging.version import Version except ImportError: @@ -33,13 +36,15 @@ class AdaBoostClassifier(BaseEstimator, ClassifierMixin): - def __init__(self, - split_criterion='gini', - max_tree_depth=1, - min_observations_in_leaf_node=1, - max_iterations=100, - learning_rate=1.0, - accuracy_threshold=0.01): + def __init__( + self, + split_criterion="gini", + max_tree_depth=1, + min_observations_in_leaf_node=1, + max_iterations=100, + learning_rate=1.0, + accuracy_threshold=0.01, + ): self.split_criterion = split_criterion self.max_tree_depth = max_tree_depth self.min_observations_in_leaf_node = min_observations_in_leaf_node @@ -48,30 +53,44 @@ def __init__(self, self.accuracy_threshold = accuracy_threshold def fit(self, X, y): - if self.split_criterion not in ('gini', 'infoGain'): - raise ValueError('Parameter "split_criterion" must be ' - '"gini" or "infoGain".') - if not isinstance(self.max_tree_depth, numbers.Integral) or \ - self.max_tree_depth < 0: - raise ValueError('Parameter "max_tree_depth" must be ' - 'positive integer value or zero.') - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') - if not isinstance(self.max_iterations, numbers.Integral) or \ - self.max_iterations <= 0: - raise ValueError('Parameter "max_iterations" must be ' - 'non-zero positive integer value.') + if self.split_criterion not in ("gini", "infoGain"): + raise ValueError( + 'Parameter "split_criterion" must be ' '"gini" or "infoGain".' + ) + if ( + not isinstance(self.max_tree_depth, numbers.Integral) + or self.max_tree_depth < 0 + ): + raise ValueError( + 'Parameter "max_tree_depth" must be ' "positive integer value or zero." + ) + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) + if ( + not isinstance(self.max_iterations, numbers.Integral) + or self.max_iterations <= 0 + ): + raise ValueError( + 'Parameter "max_iterations" must be ' "non-zero positive integer value." + ) if self.learning_rate <= 0: - raise ValueError('Parameter "learning_rate" must be ' - 'non-zero positive value.') + raise ValueError( + 'Parameter "learning_rate" must be ' "non-zero positive value." 
+ ) # it is not clear why it is so but we will get error from # Intel(R) oneAPI Data Analytics # Library otherwise if self.accuracy_threshold < 0 and self.accuracy_threshold >= 1: - raise ValueError('Parameter "accuracy_threshold" must be ' - 'more or equal to 0 and less than 1.') + raise ValueError( + 'Parameter "accuracy_threshold" must be ' + "more or equal to 0 and less than 1." + ) # Check that X and y have correct shape X, y = check_X_y(X, y, y_numeric=False, dtype=[np.single, np.double]) @@ -107,11 +126,12 @@ def fit(self, X, y): maxTreeDepth=self.max_tree_depth + 1, minObservationsInLeafNodes=self.min_observations_in_leaf_node, splitCriterion=self.split_criterion, - pruning='none') + pruning="none", + ) pr = d4p.decision_tree_classification_prediction( - fptype=fptype, - nClasses=self.n_classes_) + fptype=fptype, nClasses=self.n_classes_ + ) train_algo = d4p.adaboost_training( fptype=fptype, @@ -120,7 +140,8 @@ def fit(self, X, y): weakLearnerPrediction=pr, maxIterations=self.max_iterations, learningRate=self.learning_rate, - accuracyThreshold=self.accuracy_threshold) + accuracyThreshold=self.accuracy_threshold, + ) train_result = train_algo.compute(X, y_) @@ -135,33 +156,36 @@ def predict(self, X): if Version(sklearn_version) >= Version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, ['n_features_in_', 'n_classes_']) + check_is_fitted(self, ["n_features_in_", "n_classes_"]) # Input validation X = check_array(X, dtype=[np.single, np.double]) if X.shape[1] != self.n_features_in_: - raise ValueError('Shape of input is different from what was seen in `fit`') + raise ValueError("Shape of input is different from what was seen in `fit`") # Trivial case if self.n_classes_ == 1: return np.full(X.shape[0], self.classes_[0]) - if not hasattr(self, 'daal_model_'): - raise ValueError(( - "The class {} instance does not have 'daal_model_' attribute set. " - "Call 'fit' with appropriate arguments before using this method.").format( - type(self).__name__)) + if not hasattr(self, "daal_model_"): + raise ValueError( + ( + "The class {} instance does not have 'daal_model_' attribute set. " + "Call 'fit' with appropriate arguments before using this method." + ).format(type(self).__name__) + ) # Define type of data fptype = getFPType(X) - pr = d4p.decision_tree_classification_prediction(fptype=fptype, - nClasses=self.n_classes_) + pr = d4p.decision_tree_classification_prediction( + fptype=fptype, nClasses=self.n_classes_ + ) # Prediction - predict_algo = d4p.adaboost_prediction(fptype=fptype, - nClasses=self.n_classes_, - weakLearnerPrediction=pr) + predict_algo = d4p.adaboost_prediction( + fptype=fptype, nClasses=self.n_classes_, weakLearnerPrediction=pr + ) predict_result = predict_algo.compute(X, self.daal_model_) prediction = predict_result.prediction diff --git a/daal4py/sklearn/ensemble/GBTDAAL.py b/daal4py/sklearn/ensemble/GBTDAAL.py index 1ea795b2cd..be97722f18 100644 --- a/daal4py/sklearn/ensemble/GBTDAAL.py +++ b/daal4py/sklearn/ensemble/GBTDAAL.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,36 +12,41 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py GBT scikit-learn-compatible estimator class -import numpy as np import numbers -from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted + +import numpy as np from sklearn import preprocessing -from sklearn.utils.multiclass import check_classification_targets +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.utils import check_random_state +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + import daal4py as d4p + from .._utils import getFPType class GBTDAALBase(BaseEstimator, d4p.mb.GBTDAALBaseModel): - def __init__(self, - split_method='inexact', - max_iterations=50, - max_tree_depth=6, - shrinkage=0.3, - min_split_loss=0, - reg_lambda=1, - observations_per_tree_fraction=1, - features_per_node=0, - min_observations_in_leaf_node=5, - memory_saving_mode=False, - max_bins=256, - min_bin_size=5, - random_state=None): + def __init__( + self, + split_method="inexact", + max_iterations=50, + max_tree_depth=6, + shrinkage=0.3, + min_split_loss=0, + reg_lambda=1, + observations_per_tree_fraction=1, + features_per_node=0, + min_observations_in_leaf_node=5, + memory_saving_mode=False, + max_bins=256, + min_bin_size=5, + random_state=None, + ): self.split_method = split_method self.max_iterations = max_iterations self.max_tree_depth = max_tree_depth @@ -57,49 +62,65 @@ def __init__(self, self.random_state = random_state def _check_params(self): - if self.split_method not in ('inexact', 'exact'): - raise ValueError('Parameter "split_method" must be ' - '"inexact" or "exact".') - if not isinstance(self.max_iterations, numbers.Integral) or \ - self.max_iterations <= 0: - raise ValueError('Parameter "max_iterations" must be ' - 'non-zero positive integer value.') - if not isinstance(self.max_tree_depth, numbers.Integral) or \ - self.max_tree_depth < 0: - raise ValueError('Parameter "max_tree_depth" must be ' - 'positive integer value or zero.') + if self.split_method not in ("inexact", "exact"): + raise ValueError('Parameter "split_method" must be ' '"inexact" or "exact".') + if ( + not isinstance(self.max_iterations, numbers.Integral) + or self.max_iterations <= 0 + ): + raise ValueError( + 'Parameter "max_iterations" must be ' "non-zero positive integer value." + ) + if ( + not isinstance(self.max_tree_depth, numbers.Integral) + or self.max_tree_depth < 0 + ): + raise ValueError( + 'Parameter "max_tree_depth" must be ' "positive integer value or zero." + ) if self.shrinkage < 0 or self.shrinkage >= 1: - raise ValueError('Parameter "shrinkage" must be ' - 'more or equal to 0 and less than 1.') + raise ValueError( + 'Parameter "shrinkage" must be ' "more or equal to 0 and less than 1." + ) if self.min_split_loss < 0: - raise ValueError('Parameter "min_split_loss" must be ' - 'more or equal to zero.') + raise ValueError( + 'Parameter "min_split_loss" must be ' "more or equal to zero." 
+ ) if self.reg_lambda < 0: - raise ValueError('Parameter "reg_lambda" must be ' - 'more or equal to zero.') - if self.observations_per_tree_fraction <= 0 or \ - self.observations_per_tree_fraction > 1: - raise ValueError('Parameter "observations_per_tree_fraction" must be ' - 'more than 0 and less or equal to 1.') - if not isinstance(self.features_per_node, numbers.Integral) or \ - self.features_per_node < 0: - raise ValueError('Parameter "features_per_node" must be ' - 'positive integer value or zero.') - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') + raise ValueError('Parameter "reg_lambda" must be ' "more or equal to zero.") + if ( + self.observations_per_tree_fraction <= 0 + or self.observations_per_tree_fraction > 1 + ): + raise ValueError( + 'Parameter "observations_per_tree_fraction" must be ' + "more than 0 and less or equal to 1." + ) + if ( + not isinstance(self.features_per_node, numbers.Integral) + or self.features_per_node < 0 + ): + raise ValueError( + 'Parameter "features_per_node" must be ' "positive integer value or zero." + ) + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) if not (isinstance(self.memory_saving_mode, bool)): - raise ValueError('Parameter "memory_saving_mode" must be ' - 'boolean value.') - if not isinstance(self.max_bins, numbers.Integral) or \ - self.max_bins <= 0: - raise ValueError('Parameter "max_bins" must be ' - 'non-zero positive integer value.') - if not isinstance(self.min_bin_size, numbers.Integral) or \ - self.min_bin_size <= 0: - raise ValueError('Parameter "min_bin_size" must be ' - 'non-zero positive integer value.') + raise ValueError('Parameter "memory_saving_mode" must be ' "boolean value.") + if not isinstance(self.max_bins, numbers.Integral) or self.max_bins <= 0: + raise ValueError( + 'Parameter "max_bins" must be ' "non-zero positive integer value." + ) + if not isinstance(self.min_bin_size, numbers.Integral) or self.min_bin_size <= 0: + raise ValueError( + 'Parameter "min_bin_size" must be ' "non-zero positive integer value." 
+ ) allow_nan_ = False @@ -139,7 +160,7 @@ def fit(self, X, y): # Get random seed rs_ = check_random_state(self.random_state) - seed_ = rs_.randint(0, np.iinfo('i').max) + seed_ = rs_.randint(0, np.iinfo("i").max) # Define type of data fptype = getFPType(X) @@ -160,7 +181,8 @@ def fit(self, X, y): memorySavingMode=self.memory_saving_mode, maxBins=self.max_bins, minBinSize=self.min_bin_size, - engine=d4p.engines_mcg59(seed=seed_)) + engine=d4p.engines_mcg59(seed=seed_), + ) train_result = train_algo.compute(X, y_) # Store the model @@ -174,10 +196,10 @@ def _predict(self, X, resultsToEvaluate): if not self.allow_nan_: X = check_array(X, dtype=[np.single, np.double]) else: - X = check_array(X, dtype=[np.single, np.double], force_all_finite='allow-nan') + X = check_array(X, dtype=[np.single, np.double], force_all_finite="allow-nan") # Check is fit had been called - check_is_fitted(self, ['n_features_in_', 'n_classes_']) + check_is_fitted(self, ["n_features_in_", "n_classes_"]) # Trivial case if self.n_classes_ == 1: @@ -234,7 +256,7 @@ def fit(self, X, y): # Get random seed rs_ = check_random_state(self.random_state) - seed_ = rs_.randint(0, np.iinfo('i').max) + seed_ = rs_.randint(0, np.iinfo("i").max) # Define type of data fptype = getFPType(X) @@ -254,7 +276,8 @@ def fit(self, X, y): memorySavingMode=self.memory_saving_mode, maxBins=self.max_bins, minBinSize=self.min_bin_size, - engine=d4p.engines_mcg59(seed=seed_)) + engine=d4p.engines_mcg59(seed=seed_), + ) train_result = train_algo.compute(X, y_) # Store the model @@ -268,10 +291,10 @@ def predict(self, X): if not self.allow_nan_: X = check_array(X, dtype=[np.single, np.double]) else: - X = check_array(X, dtype=[np.single, np.double], force_all_finite='allow-nan') + X = check_array(X, dtype=[np.single, np.double], force_all_finite="allow-nan") # Check is fit had been called - check_is_fitted(self, ['n_features_in_']) + check_is_fitted(self, ["n_features_in_"]) fptype = getFPType(X) return self._predict_regression(X, fptype) diff --git a/daal4py/sklearn/ensemble/__init__.py b/daal4py/sklearn/ensemble/__init__.py index 15e97b423c..17e0d148d3 100644 --- a/daal4py/sklearn/ensemble/__init__.py +++ b/daal4py/sklearn/ensemble/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from ._forest import (RandomForestClassifier, RandomForestRegressor) -from .GBTDAAL import (GBTDAALClassifier, GBTDAALRegressor) +from ._forest import RandomForestClassifier, RandomForestRegressor from .AdaBoostClassifier import AdaBoostClassifier +from .GBTDAAL import GBTDAALClassifier, GBTDAALRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor', - 'GBTDAALClassifier', 'GBTDAALRegressor', 'AdaBoostClassifier'] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "GBTDAALClassifier", + "GBTDAALRegressor", + "AdaBoostClassifier", +] diff --git a/daal4py/sklearn/ensemble/_forest.py b/daal4py/sklearn/ensemble/_forest.py index 031c23b050..2c08aa02f3 100755 --- a/daal4py/sklearn/ensemble/_forest.py +++ b/daal4py/sklearn/ensemble/_forest.py @@ -37,10 +37,10 @@ import daal4py from daal4py.sklearn._utils import ( PatchingConditionsChain, + check_tree_nodes, daal_check_version, - sklearn_check_version, getFPType, - check_tree_nodes, + sklearn_check_version, ) from .._device_offload import support_usm_ndarray diff --git a/daal4py/sklearn/linear_model/__init__.py b/daal4py/sklearn/linear_model/__init__.py index 4bfd932017..463e164575 100755 --- a/daal4py/sklearn/linear_model/__init__.py +++ b/daal4py/sklearn/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,15 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +from .coordinate_descent import ElasticNet, Lasso from .linear import LinearRegression -from .logistic_path import logistic_regression_path, LogisticRegression +from .logistic_path import LogisticRegression, logistic_regression_path from .ridge import Ridge -from .coordinate_descent import ElasticNet, Lasso -__all__ = ['Ridge', 'LinearRegression', - 'LogisticRegression', - 'logistic_regression_path', - 'ElasticNet', - 'Lasso'] +__all__ = [ + "Ridge", + "LinearRegression", + "LogisticRegression", + "logistic_regression_path", + "ElasticNet", + "Lasso", +] diff --git a/daal4py/sklearn/linear_model/_coordinate_descent.py b/daal4py/sklearn/linear_model/_coordinate_descent.py index a02f966ed6..fde5b25c27 100755 --- a/daal4py/sklearn/linear_model/_coordinate_descent.py +++ b/daal4py/sklearn/linear_model/_coordinate_descent.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
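For reference, the __init__ hunks above and the import block that follows in _coordinate_descent.py all land in the same isort/black shape: standard-library modules first, third-party packages next, then daal4py and relative imports, with each group alphabetized and blank-line separated, and __all__ exploded to one entry per line with a trailing comma. A small sketch of that layout, assuming daal4py, numpy, scipy and scikit-learn are installed (the __all__ names are examples only):

import numbers
import warnings

import numpy as np
from scipy import sparse as sp
from sklearn.utils import check_array, check_X_y

import daal4py
from daal4py.sklearn._utils import getFPType, make2d  # first-party group comes last

__all__ = [
    "ElasticNet",
    "Lasso",
]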
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers -import daal4py + +import numpy as np from scipy import sparse as sp -from sklearn.utils import check_array, check_X_y from sklearn.linear_model._coordinate_descent import ElasticNet as ElasticNet_original from sklearn.linear_model._coordinate_descent import Lasso as Lasso_original +from sklearn.utils import check_array, check_X_y + +import daal4py from daal4py.sklearn._utils import ( - make2d, getFPType, get_patch_message, sklearn_check_version, PatchingConditionsChain) -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + PatchingConditionsChain, + get_patch_message, + getFPType, + make2d, + sklearn_check_version, +) + +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar import logging # only for compliance with Sklearn import warnings + from sklearn.exceptions import ConvergenceWarning from sklearn.preprocessing import normalize @@ -43,15 +52,22 @@ def _daal4py_check(self, X, y, check_input): # check alpha if self.alpha == 0: - warnings.warn("With alpha=0, this algorithm does not converge " - "well. You are advised to use the LinearRegression " - "estimator", stacklevel=2) + warnings.warn( + "With alpha=0, this algorithm does not converge " + "well. You are advised to use the LinearRegression " + "estimator", + stacklevel=2, + ) # check l1_ratio - if not isinstance(self.l1_ratio, numbers.Number) or \ - self.l1_ratio < 0 or self.l1_ratio > 1: - raise ValueError("l1_ratio must be between 0 and 1; " - f"got l1_ratio={self.l1_ratio}") + if ( + not isinstance(self.l1_ratio, numbers.Number) + or self.l1_ratio < 0 + or self.l1_ratio > 1 + ): + raise ValueError( + "l1_ratio must be between 0 and 1; " f"got l1_ratio={self.l1_ratio}" + ) # check precompute if isinstance(self.precompute, np.ndarray): @@ -59,17 +75,18 @@ def _daal4py_check(self, X, y, check_input): check_array(self.precompute, dtype=_fptype) self.precompute = make2d(self.precompute) else: - if self.precompute not in [False, True, 'auto']: - raise ValueError("precompute should be one of True, False, " - "'auto' or array-like. Got %r" % self.precompute) + if self.precompute not in [False, True, "auto"]: + raise ValueError( + "precompute should be one of True, False, " + "'auto' or array-like. 
Got %r" % self.precompute + ) # check selection - if self.selection not in ['random', 'cyclic']: + if self.selection not in ["random", "cyclic"]: raise ValueError("selection should be either random or cyclic.") def _daal4py_fit_enet(self, X, y_, check_input): - # appropriate checks _daal4py_check(self, X, y_, check_input) X = make2d(X) @@ -79,18 +96,18 @@ def _daal4py_fit_enet(self, X, y_, check_input): # only for dual_gap computation, it is not required for Intel(R) oneAPI # Data Analytics Library self._X = X - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self.n_features_in_ = X.shape[1] self._y = y penalty_L1 = np.asarray(self.alpha * self.l1_ratio, dtype=X.dtype) penalty_L2 = np.asarray(self.alpha * (1.0 - self.l1_ratio), dtype=X.dtype) - if (penalty_L1.size != 1 or penalty_L2.size != 1): + if penalty_L1.size != 1 or penalty_L2.size != 1: raise ValueError("alpha or l1_ratio length is wrong") penalty_L1 = penalty_L1.reshape((1, -1)) penalty_L2 = penalty_L2.reshape((1, -1)) - #normalizing and centering + # normalizing and centering X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) if y.ndim == 1: @@ -98,10 +115,10 @@ def _daal4py_fit_enet(self, X, y_, check_input): else: y_offset = np.zeros(y.shape[1], dtype=X.dtype) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _normalize = False else: - _normalize = self._normalize if sklearn_check_version('1.0') else self.normalize + _normalize = self._normalize if sklearn_check_version("1.0") else self.normalize if self.fit_intercept: X_offset = np.average(X, axis=0) if _normalize: @@ -114,25 +131,29 @@ def _daal4py_fit_enet(self, X, y_, check_input): y = y - y_offset # only for compliance with Sklearn - if isinstance(self.precompute, np.ndarray) and self.fit_intercept and \ - not np.allclose(X_offset, np.zeros(X.shape[1])) or \ - _normalize and not np.allclose(X_scale, np.ones(X.shape[1])): - warnings.warn("Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - UserWarning) + if ( + isinstance(self.precompute, np.ndarray) + and self.fit_intercept + and not np.allclose(X_offset, np.zeros(X.shape[1])) + or _normalize + and not np.allclose(X_scale, np.ones(X.shape[1])) + ): + warnings.warn( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix.", + UserWarning, + ) mse_alg = daal4py.optimization_solver_mse( - numberOfTerms=X.shape[0], - fptype=_fptype, - method='defaultDense' + numberOfTerms=X.shape[0], fptype=_fptype, method="defaultDense" ) mse_alg.setup(X, y, None) cd_solver = daal4py.optimization_solver_coordinate_descent( function=mse_alg, fptype=_fptype, - method='defaultDense', + method="defaultDense", selection=self.selection, seed=0 if self.random_state is None else self.random_state, nIterations=self.max_iter, @@ -141,36 +162,37 @@ def _daal4py_fit_enet(self, X, y_, check_input): ) # set warm_start - if self.warm_start and hasattr(self, "coef_") and \ - isinstance(self.coef_, np.ndarray): + if self.warm_start and hasattr(self, "coef_") and isinstance(self.coef_, np.ndarray): n_rows = y.shape[1] n_cols = X.shape[1] + 1 inputArgument = np.zeros((n_rows, n_cols), dtype=_fptype) for i in range(n_rows): - inputArgument[i][0] = self.intercept_ if ( - n_rows == 1) else self.intercept_[i] - inputArgument[i][1:] = self.coef_[:].copy(order='C') if ( - n_rows == 1) else self.coef_[i, :].copy(order='C') + inputArgument[i][0] = 
self.intercept_ if (n_rows == 1) else self.intercept_[i] + inputArgument[i][1:] = ( + self.coef_[:].copy(order="C") + if (n_rows == 1) + else self.coef_[i, :].copy(order="C") + ) cd_solver.setup(inputArgument) - doUse_condition = self.copy_X is False or \ - (self.fit_intercept and _normalize and self.copy_X) + doUse_condition = self.copy_X is False or ( + self.fit_intercept and _normalize and self.copy_X + ) elastic_net_alg = daal4py.elastic_net_training( fptype=_fptype, - method='defaultDense', - interceptFlag=( - self.fit_intercept is True), - dataUseInComputation='doUse' if doUse_condition else 'doNotUse', + method="defaultDense", + interceptFlag=(self.fit_intercept is True), + dataUseInComputation="doUse" if doUse_condition else "doNotUse", penaltyL1=penalty_L1, penaltyL2=penalty_L2, - optimizationSolver=cd_solver + optimizationSolver=cd_solver, ) try: if isinstance(self.precompute, np.ndarray): elastic_net_res = elastic_net_alg.compute( - data=X, dependentVariables=y, gramMatrix=self.precompute) + data=X, dependentVariables=y, gramMatrix=self.precompute + ) else: - elastic_net_res = elastic_net_alg.compute( - data=X, dependentVariables=y) + elastic_net_res = elastic_net_alg.compute(data=X, dependentVariables=y) except RuntimeError: return None @@ -182,12 +204,13 @@ def _daal4py_fit_enet(self, X, y_, check_input): if self.fit_intercept and _normalize: elastic_net_model.Beta[:, 1:] = elastic_net_model.Beta[:, 1:] / X_scale elastic_net_model.Beta[:, 0] = ( - y_offset - np.dot(X_offset, elastic_net_model.Beta[:, 1:].T)).T + y_offset - np.dot(X_offset, elastic_net_model.Beta[:, 1:].T) + ).T coefs = elastic_net_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") # only for compliance with Sklearn if y.shape[1] == 1: @@ -205,8 +228,11 @@ def _daal4py_fit_enet(self, X, y_, check_input): # only for compliance with Sklearn if self.max_iter == n_iter + 1: - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations.", ConvergenceWarning) + warnings.warn( + "Objective did not converge. 
You might want to " + "increase the number of iterations.", + ConvergenceWarning, + ) return self @@ -216,14 +242,15 @@ def _daal4py_predict_enet(self, X): _fptype = getFPType(self.coef_) elastic_net_palg = daal4py.elastic_net_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): if self.n_features_in_ != X.shape[1]: - raise ValueError(f'X has {X.shape[1]} features, ' - f'but ElasticNet is expecting ' - f'{self.n_features_in_} features as input') + raise ValueError( + f"X has {X.shape[1]} features, " + f"but ElasticNet is expecting " + f"{self.n_features_in_} features as input" + ) elastic_net_res = elastic_net_palg.compute(X, self.daal_model_) res = elastic_net_res.prediction @@ -234,7 +261,6 @@ def _daal4py_predict_enet(self, X): def _daal4py_fit_lasso(self, X, y_, check_input): - # appropriate checks _daal4py_check(self, X, y_, check_input) X = make2d(X) @@ -244,11 +270,11 @@ def _daal4py_fit_lasso(self, X, y_, check_input): # only for dual_gap computation, it is not required for Intel(R) oneAPI # Data Analytics Library self._X = X - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self.n_features_in_ = X.shape[1] self._y = y - #normalizing and centering + # normalizing and centering X_offset = np.zeros(X.shape[1], dtype=X.dtype) X_scale = np.ones(X.shape[1], dtype=X.dtype) if y.ndim == 1: @@ -256,10 +282,10 @@ def _daal4py_fit_lasso(self, X, y_, check_input): else: y_offset = np.zeros(y.shape[1], dtype=X.dtype) - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _normalize = False else: - _normalize = self._normalize if sklearn_check_version('1.0') else self.normalize + _normalize = self._normalize if sklearn_check_version("1.0") else self.normalize if self.fit_intercept: X_offset = np.average(X, axis=0) if _normalize: @@ -272,61 +298,65 @@ def _daal4py_fit_lasso(self, X, y_, check_input): y = y - y_offset # only for compliance with Sklearn - if isinstance(self.precompute, np.ndarray) and \ - self.fit_intercept and not np.allclose( - X_offset, np.zeros(X.shape[1])) or \ - _normalize and not np.allclose(X_scale, np.ones(X.shape[1])): - warnings.warn("Gram matrix was provided but X was centered" - " to fit intercept, " - "or X was normalized : recomputing Gram matrix.", - UserWarning) + if ( + isinstance(self.precompute, np.ndarray) + and self.fit_intercept + and not np.allclose(X_offset, np.zeros(X.shape[1])) + or _normalize + and not np.allclose(X_scale, np.ones(X.shape[1])) + ): + warnings.warn( + "Gram matrix was provided but X was centered" + " to fit intercept, " + "or X was normalized : recomputing Gram matrix.", + UserWarning, + ) mse_alg = daal4py.optimization_solver_mse( - numberOfTerms=X.shape[0], - fptype=_fptype, - method='defaultDense' + numberOfTerms=X.shape[0], fptype=_fptype, method="defaultDense" ) mse_alg.setup(X, y, None) cd_solver = daal4py.optimization_solver_coordinate_descent( function=mse_alg, fptype=_fptype, - method='defaultDense', + method="defaultDense", selection=self.selection, seed=0 if self.random_state is None else self.random_state, nIterations=self.max_iter, positive=self.positive, - accuracyThreshold=self.tol + accuracyThreshold=self.tol, ) # set warm_start - if self.warm_start and hasattr(self, "coef_") and \ - isinstance(self.coef_, np.ndarray): + if self.warm_start and hasattr(self, "coef_") and isinstance(self.coef_, np.ndarray): n_rows = y.shape[1] n_cols = X.shape[1] + 1 inputArgument = 
np.zeros((n_rows, n_cols), dtype=_fptype) for i in range(n_rows): - inputArgument[i][0] = self.intercept_ if ( - n_rows == 1) else self.intercept_[i] - inputArgument[i][1:] = self.coef_[:].copy(order='C') if ( - n_rows == 1) else self.coef_[i, :].copy(order='C') + inputArgument[i][0] = self.intercept_ if (n_rows == 1) else self.intercept_[i] + inputArgument[i][1:] = ( + self.coef_[:].copy(order="C") + if (n_rows == 1) + else self.coef_[i, :].copy(order="C") + ) cd_solver.setup(inputArgument) - doUse_condition = self.copy_X is False or \ - (self.fit_intercept and _normalize and self.copy_X) + doUse_condition = self.copy_X is False or ( + self.fit_intercept and _normalize and self.copy_X + ) lasso_alg = daal4py.lasso_regression_training( fptype=_fptype, - method='defaultDense', + method="defaultDense", interceptFlag=(self.fit_intercept is True), - dataUseInComputation='doUse' if doUse_condition else 'doNotUse', - lassoParameters=np.asarray( - self.alpha, dtype=X.dtype - ).reshape((1, -1)), + dataUseInComputation="doUse" if doUse_condition else "doNotUse", + lassoParameters=np.asarray(self.alpha, dtype=X.dtype).reshape((1, -1)), optimizationSolver=cd_solver, ) try: if isinstance(self.precompute, np.ndarray): lasso_res = lasso_alg.compute( - data=X, dependentVariables=y, gramMatrix=self.precompute) + data=X, dependentVariables=y, gramMatrix=self.precompute + ) else: lasso_res = lasso_alg.compute(data=X, dependentVariables=y) except RuntimeError: @@ -339,13 +369,14 @@ def _daal4py_fit_lasso(self, X, y_, check_input): # update coefficients if normalizing and centering if self.fit_intercept and _normalize: lasso_model.Beta[:, 1:] = lasso_model.Beta[:, 1:] / X_scale - lasso_model.Beta[:, 0] = \ - (y_offset - np.dot(X_offset, lasso_model.Beta[:, 1:].T)).T + lasso_model.Beta[:, 0] = ( + y_offset - np.dot(X_offset, lasso_model.Beta[:, 1:].T) + ).T coefs = lasso_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") # only for compliance with Sklearn if y.shape[1] == 1: @@ -362,9 +393,12 @@ def _daal4py_fit_lasso(self, X, y_, check_input): self.n_iter_ = np.full(y.shape[1], n_iter) # only for compliance with Sklearn - if (self.max_iter == n_iter + 1): - warnings.warn("Objective did not converge. You might want to " - "increase the number of iterations.", ConvergenceWarning) + if self.max_iter == n_iter + 1: + warnings.warn( + "Objective did not converge. 
You might want to " + "increase the number of iterations.", + ConvergenceWarning, + ) return self @@ -374,14 +408,15 @@ def _daal4py_predict_lasso(self, X): _fptype = getFPType(self.coef_) lasso_palg = daal4py.lasso_regression_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): if self.n_features_in_ != X.shape[1]: - raise ValueError(f'X has {X.shape[1]} features, ' - f'but Lasso is expecting ' - f'{self.n_features_in_} features as input') + raise ValueError( + f"X has {X.shape[1]} features, " + f"but Lasso is expecting " + f"{self.n_features_in_} features as input" + ) lasso_res = lasso_palg.compute(X, self.daal_model_) res = lasso_res.prediction @@ -392,11 +427,11 @@ def _daal4py_predict_lasso(self, X): def _fit(self, X, y, sample_weight=None, check_input=True): - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): check_scalar( self.alpha, "alpha", @@ -433,7 +468,7 @@ def _fit(self, X, y, sample_weight=None, check_input=True): X, y, copy=False, - accept_sparse='csc', + accept_sparse="csc", dtype=[np.float64, np.float32], multi_output=True, y_numeric=True, @@ -441,37 +476,44 @@ def _fit(self, X, y, sample_weight=None, check_input=True): y = check_array(y, copy=False, dtype=X.dtype.type, ensure_2d=False) if not sp.issparse(X): - self.fit_shape_good_for_daal_ = \ + self.fit_shape_good_for_daal_ = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) else: self.fit_shape_good_for_daal_ = False class_name = self.__class__.__name__ - class_inst = ElasticNet if class_name == 'ElasticNet' else Lasso + class_inst = ElasticNet if class_name == "ElasticNet" else Lasso _function_name = f"sklearn.linear_model.{class_name}.fit" - _patching_status = PatchingConditionsChain( - _function_name) - _dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (sample_weight is None, "Sample weights are not supported.")]) + _patching_status = PatchingConditionsChain(_function_name) + _dal_ready = _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. 
" + "Only np.float32 and np.float64 are supported.", + ), + (sample_weight is None, "Sample weights are not supported."), + ] + ) _patching_status.write_log() if not _dal_ready: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): res_new = super(class_inst, self).fit( - X, y, sample_weight=sample_weight, check_input=check_input) + X, y, sample_weight=sample_weight, check_input=check_input + ) else: - res_new = super(class_inst, self).fit( - X, y, check_input=check_input) + res_new = super(class_inst, self).fit(X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new self.n_iter_ = None @@ -481,17 +523,14 @@ def _fit(self, X, y, sample_weight=None, check_input=True): # only for compliance with Sklearn, # this assert is not required for Intel(R) oneAPI Data # Analytics Library - print(type(X), X.flags['F_CONTIGUOUS']) - if isinstance(X, np.ndarray) and \ - X.flags['F_CONTIGUOUS'] is False: + print(type(X), X.flags["F_CONTIGUOUS"]) + if isinstance(X, np.ndarray) and X.flags["F_CONTIGUOUS"] is False: # print(X.flags) raise ValueError("ndarray is not Fortran contiguous") - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( - self.normalize, - default=False, - estimator_name=class_name + self.normalize, default=False, estimator_name=class_name ) # only for pass tests @@ -507,29 +546,27 @@ def _fit(self, X, y, sample_weight=None, check_input=True): else: res = _daal4py_fit_lasso(self, X, y, check_input=check_input) if res is None: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - logging.info( - _function_name + ": " + get_patch_message("sklearn_after_daal") - ) - if sklearn_check_version('0.23'): + logging.info(_function_name + ": " + get_patch_message("sklearn_after_daal")) + if sklearn_check_version("0.23"): res_new = super(class_inst, self).fit( - X, y, sample_weight=sample_weight, check_input=check_input) + X, y, sample_weight=sample_weight, check_input=check_input + ) else: - res_new = super(class_inst, self).fit( - X, y, check_input=check_input) + res_new = super(class_inst, self).fit(X, y, check_input=check_input) self._gap = res_new.dual_gap_ return res_new return res def _dual_gap(self): - if (self._gap is None): + if self._gap is None: l1_reg = self.alpha * self.l1_ratio * self._X.shape[0] l2_reg = self.alpha * (1.0 - self.l1_ratio) * self._X.shape[0] n_targets = self._y.shape[1] - if (n_targets == 1): + if n_targets == 1: self._gap = self.tol + 1.0 X_offset = np.average(self._X, axis=0) y_offset = np.average(self._y, axis=0) @@ -538,11 +575,10 @@ def _dual_gap(self): XtA = np.dot((self._X - X_offset).T, R) - l2_reg * coef R_norm2 = np.dot(R.T, R) coef_norm2 = np.dot(self.coef_, self.coef_) - dual_norm_XtA = np.max( - XtA) if self.positive else np.max(np.abs(XtA)) + dual_norm_XtA = np.max(XtA) if self.positive else np.max(np.abs(XtA)) if dual_norm_XtA > l1_reg: const = l1_reg / dual_norm_XtA - A_norm2 = R_norm2 * (const ** 2) + A_norm2 = R_norm2 * (const**2) self._gap = 0.5 * (R_norm2 + A_norm2) else: const = 1.0 @@ -550,7 +586,7 @@ def _dual_gap(self): l1_norm = np.sum(np.abs(self.coef_)) tmp = l1_reg * l1_norm tmp -= const * np.dot(R.T, (self._y - y_offset)) - tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2 + tmp += 0.5 * l2_reg * (1 + const**2) * coef_norm2 self._gap += tmp 
self._gap = self._gap[0][0] else: @@ -558,17 +594,16 @@ def _dual_gap(self): X_offset = np.average(self._X, axis=0) y_offset = np.average(self._y, axis=0) for k in range(n_targets): - R = (self._y[:, k] - y_offset[k]) - \ - np.dot((self._X - X_offset), self.coef_[k, :].T) - XtA = np.dot((self._X - X_offset).T, R) - \ - l2_reg * self.coef_[k, :].T + R = (self._y[:, k] - y_offset[k]) - np.dot( + (self._X - X_offset), self.coef_[k, :].T + ) + XtA = np.dot((self._X - X_offset).T, R) - l2_reg * self.coef_[k, :].T R_norm2 = np.dot(R.T, R) coef_norm2 = np.dot(self.coef_[k, :], self.coef_[k, :].T) - dual_norm_XtA = np.max( - XtA) if self.positive else np.max(np.abs(XtA)) + dual_norm_XtA = np.max(XtA) if self.positive else np.max(np.abs(XtA)) if dual_norm_XtA > l1_reg: const = l1_reg / dual_norm_XtA - A_norm2 = R_norm2 * (const ** 2) + A_norm2 = R_norm2 * (const**2) self._gap[k] = 0.5 * (R_norm2 + A_norm2) else: const = 1.0 @@ -576,7 +611,7 @@ def _dual_gap(self): l1_norm = np.sum(np.abs(self.coef_[k, :])) tmp = l1_reg * l1_norm tmp -= const * np.dot(R.T, (self._y[:, k] - y_offset[k])) - tmp += 0.5 * l2_reg * (1 + const ** 2) * coef_norm2 + tmp += 0.5 * l2_reg * (1 + const**2) * coef_norm2 self._gap[k] += tmp return self._gap @@ -584,7 +619,7 @@ def _dual_gap(self): class ElasticNet(ElasticNet_original): __doc__ = ElasticNet_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**ElasticNet_original._parameter_constraints} def __init__( @@ -599,7 +634,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): super(ElasticNet, self).__init__( alpha=alpha, @@ -614,13 +649,15 @@ def __init__( random_state=random_state, selection=selection, ) + else: + def __init__( self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, - normalize="deprecated" if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, precompute=False, max_iter=1000, copy_X=True, @@ -628,7 +665,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): super(ElasticNet, self).__init__( alpha=alpha, @@ -645,7 +682,8 @@ def __init__( selection=selection, ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): + @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): """ @@ -685,7 +723,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): initial data in memory directly using that format. """ return _fit(self, X, y, sample_weight=sample_weight, check_input=check_input) + else: + @support_usm_ndarray() def fit(self, X, y, check_input=True): """ @@ -730,25 +770,30 @@ def predict(self, X): Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32] + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.ElasticNet.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples.")]) + "sklearn.linear_model.ElasticNet.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: @@ -771,7 +816,8 @@ def dual_gap_(self): class Lasso(Lasso_original): __doc__ = Lasso_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): + def __init__( self, alpha=1.0, @@ -783,7 +829,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): self.l1_ratio = 1.0 super().__init__( @@ -798,12 +844,14 @@ def __init__( random_state=random_state, selection=selection, ) + else: + def __init__( self, alpha=1.0, fit_intercept=True, - normalize="deprecated" if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, precompute=False, copy_X=True, max_iter=1000, @@ -811,7 +859,7 @@ def __init__( warm_start=False, positive=False, random_state=None, - selection='cyclic', + selection="cyclic", ): self.l1_ratio = 1.0 super().__init__( @@ -828,7 +876,8 @@ def __init__( selection=selection, ) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): + @support_usm_ndarray() def fit(self, X, y, sample_weight=None, check_input=True): """ @@ -868,7 +917,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): initial data in memory directly using that format. """ return _fit(self, X, y, sample_weight, check_input) + else: + @support_usm_ndarray() def fit(self, X, y, check_input=True): """ @@ -912,24 +963,27 @@ def predict(self, X): C : array, shape = (n_samples,) Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, - accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float64, np.float32] + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] ) - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Lasso.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Lasso.predict") + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: diff --git a/daal4py/sklearn/linear_model/_linear.py b/daal4py/sklearn/linear_model/_linear.py index 9883958fdb..d7044b585d 100644 --- a/daal4py/sklearn/linear_model/_linear.py +++ b/daal4py/sklearn/linear_model/_linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp +from sklearn.linear_model import LinearRegression as LinearRegression_original +from sklearn.utils import check_array -from ..utils.validation import _daal_check_array, _daal_check_X_y -from ..utils.base import _daal_validate_data -from .._utils import sklearn_check_version from .._device_offload import support_usm_ndarray -from sklearn.utils import check_array +from .._utils import sklearn_check_version +from ..utils.base import _daal_validate_data +from ..utils.validation import _daal_check_array, _daal_check_X_y -from sklearn.linear_model import LinearRegression as LinearRegression_original -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize try: @@ -32,15 +32,18 @@ except ImportError: from sklearn.externals.joblib import Parallel, delayed +import logging + import daal4py + from .._utils import ( - make2d, - getFPType, + PatchingConditionsChain, + get_dtype, get_patch_message, + getFPType, is_DataFrame, - get_dtype, - PatchingConditionsChain) -import logging + make2d, +) def _daal4py_fit(self, X, y_): @@ -49,18 +52,14 @@ def _daal4py_fit(self, X, y_): try: lr_algorithm = daal4py.linear_regression_training( - fptype=X_fptype, - interceptFlag=bool(self.fit_intercept), - method='defaultDense' + fptype=X_fptype, interceptFlag=bool(self.fit_intercept), method="defaultDense" ) lr_res = lr_algorithm.compute(X, y) except RuntimeError: # Normal system is not invertible, try QR try: lr_algorithm = daal4py.linear_regression_training( - fptype=X_fptype, - interceptFlag=bool(self.fit_intercept), - method='qrDense' + fptype=X_fptype, interceptFlag=bool(self.fit_intercept), method="qrDense" ) lr_res = lr_algorithm.compute(X, y) except RuntimeError: @@ -71,8 +70,8 @@ def _daal4py_fit(self, X, y_): self.daal_model_ = lr_model coefs = lr_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") self.n_features_in_ = X.shape[1] self.rank_ = X.shape[1] self.singular_ = np.full((X.shape[1],), np.nan) @@ -87,21 +86,19 @@ def _daal4py_fit(self, X, y_): def _daal4py_predict(self, X): X = make2d(X) _fptype = getFPType(self.coef_) - lr_pred = daal4py.linear_regression_prediction( - fptype=_fptype, - 
method='defaultDense' - ) - if sklearn_check_version('0.23'): + lr_pred = daal4py.linear_regression_prediction(fptype=_fptype, method="defaultDense") + if sklearn_check_version("0.23"): if X.shape[1] != self.n_features_in_: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but LinearRegression is expecting ' - f'{self.n_features_in_} features as input') + f"X has {X.shape[1]} features, " + f"but LinearRegression is expecting " + f"{self.n_features_in_} features as input" + ) try: lr_res = lr_pred.compute(X, self.daal_model_) except RuntimeError: raise ValueError( - f'Input data shape {X.shape} is inconsistent with the trained model' + f"Input data shape {X.shape} is inconsistent with the trained model" ) res = lr_res.prediction if res.shape[1] == 1 and self.coef_.ndim == 1: @@ -134,13 +131,13 @@ def _fit_linear(self, X, y, sample_weight=None): """ params = { - 'X': X, - 'y': y, - 'accept_sparse': ['csr', 'csc', 'coo'], - 'y_numeric': True, - 'multi_output': True, + "X": X, + "y": y, + "accept_sparse": ["csr", "csc", "coo"], + "y_numeric": True, + "multi_output": True, } - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): X, y = _daal_validate_data( self, dtype=[np.float64, np.float32], @@ -151,23 +148,35 @@ def _fit_linear(self, X, y, sample_weight=None): dtype = get_dtype(X) - self.fit_shape_good_for_daal_ = \ - bool(X.shape[0] > X.shape[1] + int(self.fit_intercept)) + self.fit_shape_good_for_daal_ = bool( + X.shape[0] > X.shape[1] + int(self.fit_intercept) + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.fit") - _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features + 1 >= number of samples."), - (sample_weight is None, "Sample weights are not supported.")]) - - if sklearn_check_version('0.22') and not sklearn_check_version('0.23'): - _patching_status.and_conditions([ - (dtype in [np.float32, np.float64], - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported.")]) + "sklearn.linear_model.LinearRegression.fit" + ) + _patching_status.and_conditions( + [ + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features + 1 >= number of samples.", + ), + (sample_weight is None, "Sample weights are not supported."), + ] + ) + + if sklearn_check_version("0.22") and not sklearn_check_version("0.23"): + _patching_status.and_conditions( + [ + ( + dtype in [np.float32, np.float64], + f"'{X.dtype}' X data type is not supported. " + "Only np.float32 and np.float64 are supported.", + ) + ] + ) _dal_ready = _patching_status.get_status() _patching_status.write_log() @@ -177,7 +186,8 @@ def _fit_linear(self, X, y, sample_weight=None): return res logging.info( "sklearn.linar_model.LinearRegression." - "fit: " + get_patch_message("sklearn_after_daal")) + "fit: " + get_patch_message("sklearn_after_daal") + ) return super(LinearRegression, self).fit( X, @@ -199,30 +209,44 @@ def _predict_linear(self, X): C : array, shape = (n_samples,) Returns predicted values. 
""" - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) is_df = is_DataFrame(X) - if sklearn_check_version('0.23'): - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + if sklearn_check_version("0.23"): + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) X = np.asarray(X) if not sp.issparse(X) and not is_df else X - good_shape_for_daal = \ + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] > X.shape[1] else False + ) _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.predict") - _dal_ready = _patching_status.and_conditions([ - (hasattr(self, 'daal_model_'), 'oneDAL model was not trained.'), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "Number of features >= number of samples."), - (not hasattr(self, 'sample_weight_') or self.sample_weight_ is None, - "Sample weights are not supported.")]) - if hasattr(self, 'fit_shape_good_for_daal_'): - _dal_ready = _patching_status.and_conditions([ - (self.fit_shape_good_for_daal_, - "The shape of X (fitting) does not satisfy oneDAL requirements: " - "Number of features + 1 >= number of samples.")]) + "sklearn.linear_model.LinearRegression.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "Number of features >= number of samples.", + ), + ( + not hasattr(self, "sample_weight_") or self.sample_weight_ is None, + "Sample weights are not supported.", + ), + ] + ) + if hasattr(self, "fit_shape_good_for_daal_"): + _dal_ready = _patching_status.and_conditions( + [ + ( + self.fit_shape_good_for_daal_, + "The shape of X (fitting) does not satisfy oneDAL requirements: " + "Number of features + 1 >= number of samples.", + ) + ] + ) _patching_status.write_log() if not _dal_ready: return self._decision_function(X) @@ -233,7 +257,7 @@ def _predict_linear(self, X): class LinearRegression(LinearRegression_original): __doc__ = LinearRegression_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **LinearRegression_original._parameter_constraints } @@ -251,11 +275,13 @@ def __init__( n_jobs=n_jobs, positive=positive, ) - elif sklearn_check_version('0.24'): + + elif sklearn_check_version("0.24"): + def __init__( self, fit_intercept=True, - normalize='deprecated' if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, copy_X=True, n_jobs=None, positive=False, @@ -267,7 +293,9 @@ def __init__( n_jobs=n_jobs, positive=positive, ) + else: + def __init__( self, fit_intercept=True, @@ -279,7 +307,7 @@ def __init__( fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, - n_jobs=n_jobs + n_jobs=n_jobs, ) @support_usm_ndarray() @@ -305,23 +333,29 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( self.normalize, default=False, estimator_name=self.__class__.__name__, ) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - if sklearn_check_version('0.24'): + if sklearn_check_version("0.24"): _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LinearRegression.fit") - _dal_ready = _patching_status.and_conditions([ - (self.positive is False, - "Forced positive coefficients are not supported.")]) + "sklearn.linear_model.LinearRegression.fit" + ) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.positive is False, + "Forced positive coefficients are not supported.", + ) + ] + ) if not _dal_ready: _patching_status.write_log() return super(LinearRegression, self).fit( diff --git a/daal4py/sklearn/linear_model/_ridge.py b/daal4py/sklearn/linear_model/_ridge.py index b3bf466aca..06fd2ade02 100644 --- a/daal4py/sklearn/linear_model/_ridge.py +++ b/daal4py/sklearn/linear_model/_ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +import logging import numbers + import numpy as np from scipy import sparse as sp -from sklearn.utils import check_array, check_X_y -from sklearn.linear_model._ridge import _BaseRidge from sklearn.linear_model._ridge import Ridge as Ridge_original +from sklearn.linear_model._ridge import _BaseRidge +from sklearn.utils import check_array, check_X_y import daal4py -from .._utils import ( - make2d, getFPType, get_patch_message, sklearn_check_version, - PatchingConditionsChain) -from .._device_offload import support_usm_ndarray -import logging -if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): +from .._device_offload import support_usm_ndarray +from .._utils import ( + PatchingConditionsChain, + get_patch_message, + getFPType, + make2d, + sklearn_check_version, +) + +if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar @@ -44,14 +50,15 @@ def _daal4py_fit(self, X, y_): if ridge_params.size != 1 and ridge_params.size != y.shape[1]: raise ValueError( "Number of targets and number of penalties do not correspond: " - f"{ridge_params.size} != {y.shape[1]}") + f"{ridge_params.size} != {y.shape[1]}" + ) ridge_params = ridge_params.reshape((1, -1)) ridge_alg = daal4py.ridge_regression_training( fptype=_fptype, - method='defaultDense', + method="defaultDense", interceptFlag=(self.fit_intercept is True), - ridgeParameters=ridge_params + ridgeParameters=ridge_params, ) try: ridge_res = 
ridge_alg.compute(X, y) @@ -62,8 +69,8 @@ def _daal4py_fit(self, X, y_): self.daal_model_ = ridge_model coefs = ridge_model.Beta - self.intercept_ = coefs[:, 0].copy(order='C') - self.coef_ = coefs[:, 1:].copy(order='C') + self.intercept_ = coefs[:, 0].copy(order="C") + self.coef_ = coefs[:, 1:].copy(order="C") if self.coef_.shape[0] == 1 and y_.ndim == 1: self.coef_ = np.ravel(self.coef_) @@ -77,13 +84,12 @@ def _daal4py_predict(self, X): _fptype = getFPType(self.coef_) ridge_palg = daal4py.ridge_regression_prediction( - fptype=_fptype, - method='defaultDense' + fptype=_fptype, method="defaultDense" ) if self.n_features_in_ != X.shape[1]: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but Ridge is expecting {self.n_features_in_} features as input' + f"X has {X.shape[1]} features, " + f"but Ridge is expecting {self.n_features_in_} features as input" ) ridge_res = ridge_palg.compute(X, self.daal_model_) @@ -112,17 +118,15 @@ def _fit_ridge(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( - self.normalize, - default=False, - estimator_name=self.__class__.__name__ + self.normalize, default=False, estimator_name=self.__class__.__name__ ) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): if self.max_iter is not None: self.max_iter = check_scalar( self.max_iter, "max_iter", target_type=numbers.Integral, min_val=1 @@ -137,40 +141,57 @@ def _fit_ridge(self, X, y, sample_weight=None): include_boundaries="left", ) - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=[np.float64, np.float32], - multi_output=True, y_numeric=True) + X, y = check_X_y( + X, + y, + ["csr", "csc", "coo"], + dtype=[np.float64, np.float32], + multi_output=True, + y_numeric=True, + ) self.n_features_in_ = X.shape[1] self.sample_weight_ = sample_weight self.fit_shape_good_for_daal_ = True if X.shape[0] >= X.shape[1] else False - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Ridge.fit") - _dal_ready = _patching_status.and_conditions([ - (self.solver == 'auto', - f"'{self.solver}' solver is not supported. " - "Only 'auto' solver is supported."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.fit_shape_good_for_daal_, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (sample_weight is None, "Sample weights are not supported."), - (not (hasattr(self, 'positive') and self.positive), - "Forced positive coefficients are not supported.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Ridge.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + self.fit_shape_good_for_daal_, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. " + "Only np.float32 and np.float64 are supported.", + ), + (sample_weight is None, "Sample weights are not supported."), + ( + not (hasattr(self, "positive") and self.positive), + "Forced positive coefficients are not supported.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ return super(Ridge, self).fit(X, y, sample_weight=sample_weight) self.n_iter_ = None res = _daal4py_fit(self, X, y) if res is None: logging.info( - "sklearn.linear_model.Ridge.fit: " + get_patch_message("sklearn_after_daal")) - if hasattr(self, 'daal_model_'): + "sklearn.linear_model.Ridge.fit: " + get_patch_message("sklearn_after_daal") + ) + if hasattr(self, "daal_model_"): del self.daal_model_ return super(Ridge, self).fit(X, y, sample_weight=sample_weight) return res @@ -189,30 +210,42 @@ def _predict_ridge(self, X): C : array, shape = (n_samples,) Returns predicted values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) X = check_array( - X, accept_sparse=['csr', 'csc', 'coo'], dtype=[np.float64, np.float32]) - good_shape_for_daal = \ + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float64, np.float32] + ) + good_shape_for_daal = ( True if X.ndim <= 1 else True if X.shape[0] >= X.shape[1] else False + ) - _patching_status = PatchingConditionsChain( - "sklearn.linear_model.Ridge.predict") - _dal_ready = _patching_status.and_conditions([ - (self.solver == 'auto', - f"'{self.solver}' solver is not supported. " - "Only 'auto' solver is supported."), - (hasattr(self, 'daal_model_'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (good_shape_for_daal, - "The shape of X does not satisfy oneDAL requirements: " - "number of features > number of samples."), - (X.dtype == np.float64 or X.dtype == np.float32, - f"'{X.dtype}' X data type is not supported. " - "Only np.float32 and np.float64 are supported."), - (not hasattr(self, 'sample_weight_') or self.sample_weight_ is None, - "Sample weights are not supported.")]) + _patching_status = PatchingConditionsChain("sklearn.linear_model.Ridge.predict") + _dal_ready = _patching_status.and_conditions( + [ + ( + self.solver == "auto", + f"'{self.solver}' solver is not supported. " + "Only 'auto' solver is supported.", + ), + (hasattr(self, "daal_model_"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + good_shape_for_daal, + "The shape of X does not satisfy oneDAL requirements: " + "number of features > number of samples.", + ), + ( + X.dtype == np.float64 or X.dtype == np.float32, + f"'{X.dtype}' X data type is not supported. 
" + "Only np.float32 and np.float64 are supported.", + ), + ( + not hasattr(self, "sample_weight_") or self.sample_weight_ is None, + "Sample weights are not supported.", + ), + ] + ) _patching_status.write_log() if not _dal_ready: @@ -223,7 +256,7 @@ def _predict_ridge(self, X): class Ridge(Ridge_original, _BaseRidge): __doc__ = Ridge_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**Ridge_original._parameter_constraints} def __init__( @@ -245,12 +278,14 @@ def __init__( self.solver = solver self.positive = positive self.random_state = random_state - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + def __init__( self, alpha=1.0, fit_intercept=True, - normalize='deprecated', + normalize="deprecated", copy_X=True, max_iter=None, tol=1e-3, @@ -267,7 +302,9 @@ def __init__( self.solver = solver self.positive = positive self.random_state = random_state + else: + def __init__( self, alpha=1.0, diff --git a/daal4py/sklearn/linear_model/coordinate_descent.py b/daal4py/sklearn/linear_model/coordinate_descent.py index 2519306665..a70fcb3f80 100755 --- a/daal4py/sklearn/linear_model/coordinate_descent.py +++ b/daal4py/sklearn/linear_model/coordinate_descent.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._coordinate_descent import * diff --git a/daal4py/sklearn/linear_model/linear.py b/daal4py/sklearn/linear_model/linear.py index 5f1970460a..5325b86de5 100644 --- a/daal4py/sklearn/linear_model/linear.py +++ b/daal4py/sklearn/linear_model/linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._linear import * diff --git a/daal4py/sklearn/linear_model/logistic_loss.py b/daal4py/sklearn/linear_model/logistic_loss.py index 9aea83cfe8..a1a1c4cba6 100644 --- a/daal4py/sklearn/linear_model/logistic_loss.py +++ b/daal4py/sklearn/linear_model/logistic_loss.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,29 +12,39 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np import daal4py -from .._utils import (make2d, getFPType) + +from .._utils import getFPType, make2d def _resultsToCompute_string(value=True, gradient=True, hessian=False): results_needed = [] if value: - results_needed.append('value') + results_needed.append("value") if gradient: - results_needed.append('gradient') + results_needed.append("gradient") if hessian: - results_needed.append('hessian') - - return '|'.join(results_needed) - - -def _daal4py_logistic_loss_extra_args(nClasses_unused, beta, X, y, - l1=0.0, l2=0.0, fit_intercept=True, - value=True, gradient=True, hessian=False): + results_needed.append("hessian") + + return "|".join(results_needed) + + +def _daal4py_logistic_loss_extra_args( + nClasses_unused, + beta, + X, + y, + l1=0.0, + l2=0.0, + fit_intercept=True, + value=True, + gradient=True, + hessian=False, +): X = make2d(X) nSamples, nFeatures = X.shape @@ -43,26 +53,35 @@ def _daal4py_logistic_loss_extra_args(nClasses_unused, beta, X, y, n = X.shape[0] results_to_compute = _resultsToCompute_string( - value=value, gradient=gradient, hessian=hessian) - - objective_function_algorithm_instance = \ - daal4py.optimization_solver_logistic_loss( - numberOfTerms=n, - fptype=getFPType(X), - method='defaultDense', - interceptFlag=fit_intercept, - penaltyL1=l1 / n, - penaltyL2=l2 / n, - resultsToCompute=results_to_compute - ) + value=value, gradient=gradient, hessian=hessian + ) + + objective_function_algorithm_instance = daal4py.optimization_solver_logistic_loss( + numberOfTerms=n, + fptype=getFPType(X), + method="defaultDense", + interceptFlag=fit_intercept, + penaltyL1=l1 / n, + penaltyL2=l2 / n, + resultsToCompute=results_to_compute, + ) objective_function_algorithm_instance.setup(X, y, beta) return (objective_function_algorithm_instance, X, y, n) -def _daal4py_cross_entropy_loss_extra_args(nClasses, beta, X, y, - l1=0.0, l2=0.0, fit_intercept=True, - value=True, gradient=True, hessian=False): +def _daal4py_cross_entropy_loss_extra_args( + nClasses, + beta, + X, + y, + l1=0.0, + l2=0.0, + fit_intercept=True, + value=True, + gradient=True, + hessian=False, +): X = make2d(X) nSamples, nFeatures = X.shape y = make2d(y) @@ -70,19 +89,21 @@ def _daal4py_cross_entropy_loss_extra_args(nClasses, beta, X, y, n = X.shape[0] results_to_compute = _resultsToCompute_string( - value=value, gradient=gradient, hessian=hessian) + value=value, gradient=gradient, hessian=hessian + ) - objective_function_algorithm_instance = \ + objective_function_algorithm_instance = ( daal4py.optimization_solver_cross_entropy_loss( nClasses=nClasses, numberOfTerms=n, fptype=getFPType(X), - method='defaultDense', + method="defaultDense", interceptFlag=fit_intercept, penaltyL1=l1 / n, penaltyL2=l2 / n, - resultsToCompute=results_to_compute + resultsToCompute=results_to_compute, ) + ) objective_function_algorithm_instance.setup(X, y, beta) return (objective_function_algorithm_instance, X, y, n) @@ -150,6 +171,7 @@ def hessp(v): res[1:] = np.dot(pp0, X) res[1:] += (2 * l2) * v[1:] return res + else: # dealing with multi-class logistic regression beta__ = beta_.reshape((-1, 1 + X.shape[1])) # (nClasses, nSamples) diff --git a/daal4py/sklearn/linear_model/logistic_path.py b/daal4py/sklearn/linear_model/logistic_path.py index 
85d4165e32..93ada4aada 100755 --- a/daal4py/sklearn/linear_model/logistic_path.py +++ b/daal4py/sklearn/linear_model/logistic_path.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,39 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import daal4py as d4p -import numpy as np -import scipy.sparse as sparse -import scipy.optimize as optimize import numbers -from .._utils import ( - getFPType, sklearn_check_version, PatchingConditionsChain) -from .logistic_loss import (_daal4py_loss_and_grad, - _daal4py_logistic_loss_extra_args, - _daal4py_cross_entropy_loss_extra_args, - _daal4py_loss_, _daal4py_grad_, - _daal4py_grad_hess_) +import numpy as np +import scipy.optimize as optimize +import scipy.sparse as sparse import sklearn.linear_model._logistic as logistic_module - -from sklearn.utils import (check_array, - check_consistent_length, - compute_class_weight, - check_random_state) -from sklearn.utils.validation import _check_sample_weight, check_is_fitted from sklearn.linear_model._sag import sag_solver -from sklearn.utils.optimize import _newton_cg, _check_optimize_result -if sklearn_check_version('1.1'): - from sklearn.linear_model._linear_loss import LinearModelLoss +from sklearn.utils import ( + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) +from sklearn.utils.optimize import _check_optimize_result, _newton_cg +from sklearn.utils.validation import _check_sample_weight, check_is_fitted + +import daal4py as d4p + +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version +from .logistic_loss import ( + _daal4py_cross_entropy_loss_extra_args, + _daal4py_grad_, + _daal4py_grad_hess_, + _daal4py_logistic_loss_extra_args, + _daal4py_loss_, + _daal4py_loss_and_grad, +) + +if sklearn_check_version("1.1"): from sklearn._loss.loss import HalfBinomialLoss, HalfMultinomialLoss + from sklearn.linear_model._linear_loss import LinearModelLoss + from sklearn.linear_model._logistic import _LOGISTIC_SOLVER_CONVERGENCE_MSG + from sklearn.linear_model._logistic import ( + LogisticRegression as LogisticRegression_original, + ) from sklearn.linear_model._logistic import ( - _check_solver, _check_multi_class, + _check_solver, _fit_liblinear, - _LOGISTIC_SOLVER_CONVERGENCE_MSG, - LogisticRegression as LogisticRegression_original) + ) else: from sklearn.linear_model._logistic import ( _check_solver, @@ -57,9 +66,12 @@ _multinomial_loss_grad, _multinomial_grad_hess, _LOGISTIC_SOLVER_CONVERGENCE_MSG, - LogisticRegression as LogisticRegression_original) + LogisticRegression as LogisticRegression_original, + ) + from sklearn.linear_model._logistic import _logistic_regression_path as lr_path_original -from sklearn.preprocessing import LabelEncoder, LabelBinarizer +from sklearn.preprocessing import LabelBinarizer, LabelEncoder + from .._device_offload import support_usm_ndarray @@ -73,13 +85,13 @@ def __logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, 
class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='warn', + penalty="l2", + intercept_scaling=1.0, + multi_class="warn", random_state=None, check_input=True, max_squared_sum=None, @@ -237,22 +249,35 @@ def __logistic_regression_path( The "copy" parameter was removed. """ _patching_status = PatchingConditionsChain( - "sklearn.linear_model.LogisticRegression.fit") + "sklearn.linear_model.LogisticRegression.fit" + ) # TODO: remove this fallback workaround after # logistic path is reworked to align with sklearn 1.2 - _dal_ready = _patching_status.and_conditions([ - (not (sklearn_check_version('1.2') and solver == 'newton-cholesky'), - f"'{solver}' solver is not supported. " - "Only 'lbfgs' and 'newton-cg' solvers are supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + not (sklearn_check_version("1.2") and solver == "newton-cholesky"), + f"'{solver}' solver is not supported. " + "Only 'lbfgs' and 'newton-cg' solvers are supported.", + ) + ] + ) if not _dal_ready: _patching_status.write_log() return lr_path_original( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, @@ -260,7 +285,7 @@ def __logistic_regression_path( max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio, - n_threads=n_threads + n_threads=n_threads, ) if isinstance(Cs, numbers.Integral): @@ -270,19 +295,19 @@ def __logistic_regression_path( # Preprocessing. if check_input: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): X = check_array( X, - accept_sparse='csr', + accept_sparse="csr", dtype=np.float64, accept_large_sparse=solver not in ["liblinear", "sag", "saga"], ) else: X = check_array( X, - accept_sparse='csr', + accept_sparse="csr", dtype=np.float64, - accept_large_sparse=solver != 'liblinear', + accept_large_sparse=solver != "liblinear", ) y = check_array(y, ensure_2d=False, dtype=None) check_consistent_length(X, y) @@ -292,45 +317,50 @@ def __logistic_regression_path( random_state = check_random_state(random_state) multi_class = _check_multi_class(multi_class, solver, len(classes)) - if pos_class is None and multi_class != 'multinomial': - if (classes.size > 2): - raise ValueError('To fit OvR, use the pos_class argument') + if pos_class is None and multi_class != "multinomial": + if classes.size > 2: + raise ValueError("To fit OvR, use the pos_class argument") # np.unique(y) gives labels in sorted order. pos_class = classes[1] - _dal_ready = _patching_status.and_conditions([ - (solver in ['lbfgs', 'newton-cg'], - f"'{solver}' solver is not supported. " - "Only 'lbfgs' and 'newton-cg' solvers are supported."), - (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), - (sample_weight is None, "Sample weights are not supported."), - (class_weight is None, "Class weights are not supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + solver in ["lbfgs", "newton-cg"], + f"'{solver}' solver is not supported. " + "Only 'lbfgs' and 'newton-cg' solvers are supported.", + ), + (not sparse.issparse(X), "X is sparse. 
Sparse input is not supported."), + (sample_weight is None, "Sample weights are not supported."), + (class_weight is None, "Class weights are not supported."), + ] + ) if not _dal_ready: - if sklearn_check_version('0.24'): - sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype, - copy=True) + if sklearn_check_version("0.24"): + sample_weight = _check_sample_weight( + sample_weight, X, dtype=X.dtype, copy=True + ) else: - sample_weight = _check_sample_weight(sample_weight, X, - dtype=X.dtype) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then # the class_weights are assigned after masking the labels with a OvR. le = LabelEncoder() - if (isinstance(class_weight, dict) or multi_class == 'multinomial') and \ - not _dal_ready: + if ( + isinstance(class_weight, dict) or multi_class == "multinomial" + ) and not _dal_ready: class_weight_ = compute_class_weight(class_weight, classes=classes, y=y) if not np.allclose(class_weight_, np.ones_like(class_weight_)): sample_weight *= class_weight_[le.fit_transform(y)] # For doing a ovr, we need to mask the labels first. for the # multinomial case this is not necessary. - if multi_class == 'ovr': + if multi_class == "ovr": y_bin = np.ones(y.shape, dtype=X.dtype) - if sklearn_check_version('1.1'): - mask = (y == pos_class) + if sklearn_check_version("1.1"): + mask = y == pos_class y_bin = np.ones(y.shape, dtype=X.dtype) # for compute_class_weight @@ -344,24 +374,25 @@ def __logistic_regression_path( y_bin[~mask] = -1.0 else: mask_classes = np.array([-1, 1]) - mask = (y == pos_class) - y_bin[~mask] = -1. + mask = y == pos_class + y_bin[~mask] = -1.0 # for compute_class_weight if class_weight == "balanced" and not _dal_ready: - class_weight_ = compute_class_weight(class_weight, classes=mask_classes, - y=y_bin) + class_weight_ = compute_class_weight( + class_weight, classes=mask_classes, y=y_bin + ) if not np.allclose(class_weight_, np.ones_like(class_weight_)): sample_weight *= class_weight_[le.fit_transform(y_bin)] if _dal_ready: w0 = np.zeros(n_features + 1, dtype=X.dtype) - y_bin[~mask] = 0. + y_bin[~mask] = 0.0 else: w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if solver in ["sag", "saga", "lbfgs", "newton-cg"]: # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder, # not LabelBinarizer, i.e. y as a 1d-array of integers. 
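For orientation between hunks: the reformatted code above follows a recurring oneDAL-dispatch idiom built on PatchingConditionsChain. A minimal, self-contained sketch of that idiom (not part of the patch; the helper name _dispatch_sketch is illustrative, daal4py is assumed to be installed, and only calls already visible in this file are used) looks like:

from daal4py.sklearn._utils import PatchingConditionsChain


def _dispatch_sketch(solver, sample_weight):
    # Build a named condition chain; and_conditions() takes (bool, reason) pairs,
    # returns True only when every condition holds, and records the reasons.
    status = PatchingConditionsChain("sklearn.linear_model.LogisticRegression.fit")
    dal_ready = status.and_conditions(
        [
            (
                solver in ("lbfgs", "newton-cg"),
                f"'{solver}' solver is not supported. "
                "Only 'lbfgs' and 'newton-cg' solvers are supported.",
            ),
            (sample_weight is None, "Sample weights are not supported."),
        ]
    )
    status.write_log()  # emit the patching decision to daal4py's logger
    # The real code falls back to the stock scikit-learn path when not dal_ready;
    # this sketch only reports which branch would run.
    return "oneDAL branch" if dal_ready else "stock scikit-learn fallback"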
@@ -379,7 +410,7 @@ def __logistic_regression_path( if Y_multi.shape[1] == 1: Y_multi = np.hstack([1 - Y_multi, Y_multi]) else: - if solver not in ['sag', 'saga']: + if solver not in ["sag", "saga"]: if _dal_ready: Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) else: @@ -393,24 +424,26 @@ def __logistic_regression_path( Y_multi = le.fit_transform(y).astype(X.dtype, copy=False) if _dal_ready: - w0 = np.zeros((classes.size, n_features + 1), - order='C', dtype=X.dtype) + w0 = np.zeros((classes.size, n_features + 1), order="C", dtype=X.dtype) else: - w0 = np.zeros((classes.size, n_features + int(fit_intercept)), - order='F', dtype=X.dtype) + w0 = np.zeros( + (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype + ) if coef is not None: # it must work both giving the bias term and not - if multi_class == 'ovr': + if multi_class == "ovr": if coef.size not in (n_features, w0.size): raise ValueError( - 'Initialization coef is of shape %d, expected shape ' - '%d or %d' % (coef.size, n_features, w0.size)) + "Initialization coef is of shape %d, expected shape " + "%d or %d" % (coef.size, n_features, w0.size) + ) if _dal_ready: - w0[-coef.size:] = \ + w0[-coef.size :] = ( np.roll(coef, 1, -1) if coef.size != n_features else coef + ) else: - w0[:coef.size] = coef + w0[: coef.size] = coef else: # For binary problems coef.shape[0] should be 1, otherwise it # should be classes.size. @@ -418,49 +451,59 @@ def __logistic_regression_path( if n_classes == 2: n_classes = 1 - if coef.shape[0] != n_classes or \ - coef.shape[1] not in (n_features, n_features + 1): + if coef.shape[0] != n_classes or coef.shape[1] not in ( + n_features, + n_features + 1, + ): raise ValueError( - 'Initialization coef is of shape (%d, %d), expected ' - 'shape (%d, %d) or (%d, %d)' % ( - coef.shape[0], coef.shape[1], classes.size, - n_features, classes.size, n_features + 1)) + "Initialization coef is of shape (%d, %d), expected " + "shape (%d, %d) or (%d, %d)" + % ( + coef.shape[0], + coef.shape[1], + classes.size, + n_features, + classes.size, + n_features + 1, + ) + ) if _dal_ready: - w0[:, -coef.shape[1]:] = \ + w0[:, -coef.shape[1] :] = ( np.roll(coef, 1, -1) if coef.shape[1] != n_features else coef + ) else: if n_classes == 1: - w0[0, :coef.shape[1]] = -coef - w0[1, :coef.shape[1]] = coef + w0[0, : coef.shape[1]] = -coef + w0[1, : coef.shape[1]] = coef else: - w0[:, :coef.shape[1]] = coef + w0[:, : coef.shape[1]] = coef C_daal_multiplier = 1 # commented out because this is Py3 feature - #def _map_to_binary_logistic_regression(): + # def _map_to_binary_logistic_regression(): # nonlocal C_daal_multiplier # nonlocal w0 # C_daal_multiplier = 2 # w0 *= 2 - if multi_class == 'multinomial': + if multi_class == "multinomial": # fmin_l_bfgs_b and newton-cg accepts only ravelled parameters. 
- if solver in ['lbfgs', 'newton-cg']: + if solver in ["lbfgs", "newton-cg"]: if _dal_ready and classes.size == 2: w0 = w0[-1:, :] - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): w0 = w0.ravel(order="F") else: w0 = w0.ravel() target = Y_multi loss = None - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfMultinomialLoss(n_classes=classes.size), fit_intercept=fit_intercept, ) - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: if classes.size == 2: # _map_to_binary_logistic_regression() @@ -471,12 +514,14 @@ def __logistic_regression_path( daal_extra_args_func = _daal4py_cross_entropy_loss_extra_args func = _daal4py_loss_and_grad else: - if sklearn_check_version('1.1') and loss is not None: + if sklearn_check_version("1.1") and loss is not None: func = loss.loss_gradient else: + def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2] - elif solver == 'newton-cg': + + elif solver == "newton-cg": if _dal_ready: if classes.size == 2: # _map_to_binary_logistic_regression() @@ -489,40 +534,42 @@ def func(x, *args): grad = _daal4py_grad_ hess = _daal4py_grad_hess_ else: - if sklearn_check_version('1.1') and loss is not None: + if sklearn_check_version("1.1") and loss is not None: func = loss.loss grad = loss.gradient hess = loss.gradient_hessian_product # hess = [gradient, hessp] else: + def func(x, *args): return _multinomial_loss(x, *args)[0] def grad(x, *args): return _multinomial_loss_grad(x, *args)[1] + hess = _multinomial_grad_hess - warm_start_sag = {'coef': w0.T} + warm_start_sag = {"coef": w0.T} else: target = y_bin - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: func = _daal4py_loss_and_grad daal_extra_args_func = _daal4py_logistic_loss_extra_args else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) func = loss.loss_gradient else: func = _logistic_loss_and_grad - elif solver == 'newton-cg': + elif solver == "newton-cg": if _dal_ready: daal_extra_args_func = _daal4py_logistic_loss_extra_args func = _daal4py_loss_ grad = _daal4py_grad_ hess = _daal4py_grad_hess_ else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): loss = LinearModelLoss( base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept ) @@ -534,90 +581,105 @@ def grad(x, *args): def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1] + hess = _logistic_grad_hess - warm_start_sag = {'coef': np.expand_dims(w0, axis=1)} + warm_start_sag = {"coef": np.expand_dims(w0, axis=1)} coefs = list() n_iter = np.zeros(len(Cs), dtype=np.int32) for i, C in enumerate(Cs): - if solver == 'lbfgs': + if solver == "lbfgs": if _dal_ready: extra_args = daal_extra_args_func( classes.size, w0, X, target, - 0., - 1. / (2 * C * C_daal_multiplier), + 0.0, + 1.0 / (2 * C * C_daal_multiplier), fit_intercept, value=True, gradient=True, - hessian=False + hessian=False, ) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): l2_reg_strength = 1.0 / C extra_args = (X, target, sample_weight, l2_reg_strength, n_threads) else: - extra_args = (X, target, 1. 
/ C, sample_weight) + extra_args = (X, target, 1.0 / C, sample_weight) iprint = [-1, 50, 1, 100, 101][ - np.searchsorted(np.array([0, 1, 2, 3]), verbose)] + np.searchsorted(np.array([0, 1, 2, 3]), verbose) + ] opt_res = optimize.minimize( func, w0, method="L-BFGS-B", jac=True, args=extra_args, - options={"iprint": iprint, "gtol": tol, "maxiter": max_iter} + options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, ) n_iter_i = _check_optimize_result( solver, opt_res, max_iter, - extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG) + extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG, + ) w0, loss = opt_res.x, opt_res.fun if _dal_ready and C_daal_multiplier == 2: w0 /= 2 - elif solver == 'newton-cg': + elif solver == "newton-cg": if _dal_ready: + def make_ncg_funcs(f, value=False, gradient=False, hessian=False): - daal_penaltyL2 = 1. / (2 * C * C_daal_multiplier) + daal_penaltyL2 = 1.0 / (2 * C * C_daal_multiplier) _obj_, X_, y_, n_samples = daal_extra_args_func( classes.size, w0, X, target, - 0., + 0.0, daal_penaltyL2, fit_intercept, value=value, gradient=gradient, - hessian=hessian + hessian=hessian, ) def _func_(x, *args): return f(x, _obj_, *args) + return _func_, (X_, y_, n_samples, daal_penaltyL2) loss_func, extra_args = make_ncg_funcs(func, value=True) grad_func, _ = make_ncg_funcs(grad, gradient=True) grad_hess_func, _ = make_ncg_funcs(hess, gradient=True) - w0, n_iter_i = _newton_cg(grad_hess_func, loss_func, grad_func, - w0, args=extra_args, - maxiter=max_iter, tol=tol) + w0, n_iter_i = _newton_cg( + grad_hess_func, + loss_func, + grad_func, + w0, + args=extra_args, + maxiter=max_iter, + tol=tol, + ) else: - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): l2_reg_strength = 1.0 / C args = (X, target, sample_weight, l2_reg_strength, n_threads) else: - args = (X, target, 1. / C, sample_weight) + args = (X, target, 1.0 / C, sample_weight) w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) - elif solver == 'liblinear': - coef_, intercept_, n_iter_i, = _fit_liblinear( + elif solver == "liblinear": + ( + coef_, + intercept_, + n_iter_i, + ) = _fit_liblinear( X, target, C, @@ -637,22 +699,22 @@ def _func_(x, *args): else: w0 = coef_.ravel() - elif solver in ['sag', 'saga']: - if multi_class == 'multinomial': + elif solver in ["sag", "saga"]: + if multi_class == "multinomial": target = target.astype(X.dtype, copy=False) - loss = 'multinomial' + loss = "multinomial" else: - loss = 'log' + loss = "log" # alpha is for L2-norm, beta is for L1-norm - if penalty == 'l1': - alpha = 0. - beta = 1. / C - elif penalty == 'l2': - alpha = 1. / C - beta = 0. + if penalty == "l1": + alpha = 0.0 + beta = 1.0 / C + elif penalty == "l2": + alpha = 1.0 / C + beta = 0.0 else: # Elastic-Net penalty - alpha = (1. / C) * (1 - l1_ratio) - beta = (1. 
/ C) * l1_ratio + alpha = (1.0 / C) * (1 - l1_ratio) + beta = (1.0 / C) * l1_ratio w0, n_iter_i, warm_start_sag = sag_solver( X, @@ -668,7 +730,7 @@ def _func_(x, *args): False, max_squared_sum, warm_start_sag, - is_saga=(solver == 'saga') + is_saga=(solver == "saga"), ) else: @@ -677,7 +739,7 @@ def _func_(x, *args): "'newton-cg', 'sag'}, got '%s' instead" % solver ) - if multi_class == 'multinomial': + if multi_class == "multinomial": if _dal_ready: if classes.size == 2: multi_w0 = w0[np.newaxis, :] @@ -685,7 +747,7 @@ def _func_(x, *args): multi_w0 = np.reshape(w0, (classes.size, -1)) else: n_classes = max(2, classes.size) - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if solver in ["lbfgs", "newton-cg"]: multi_w0 = np.reshape(w0, (n_classes, -1), order="F") else: @@ -715,82 +777,102 @@ def _func_(x, *args): def daal4py_predict(self, X, resultsToEvaluate): check_is_fitted(self) - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) try: fptype = getFPType(X) except ValueError: fptype = None - if resultsToEvaluate == 'computeClassLabels': - _function_name = 'predict' - elif resultsToEvaluate == 'computeClassProbabilities': - _function_name = 'predict_proba' - elif resultsToEvaluate == 'computeClassLogProbabilities': - _function_name = 'predict_log_proba' + if resultsToEvaluate == "computeClassLabels": + _function_name = "predict" + elif resultsToEvaluate == "computeClassProbabilities": + _function_name = "predict_proba" + elif resultsToEvaluate == "computeClassLogProbabilities": + _function_name = "predict_log_proba" else: - raise ValueError('resultsToEvaluate must be in [computeClassLabels, \ - computeClassProbabilities, computeClassLogProbabilities]') + raise ValueError( + "resultsToEvaluate must be in [computeClassLabels, \ + computeClassProbabilities, computeClassLogProbabilities]" + ) _patching_status = PatchingConditionsChain( - f"sklearn.linear_model.LogisticRegression.{_function_name}") - _patching_status.and_conditions([ - (self.multi_class in ["multinomial", "warn"], - f"{self.multi_class} multiclass option is not supported. " - "Only 'multinomial' or 'warn' options are supported."), - (self.classes_.size == 2, "Number of classes != 2."), - (resultsToEvaluate == 'computeClassLabels', - "resultsToEvaluate != 'computeClassLabels'.")], - conditions_merging=any) - _dal_ready = _patching_status.and_conditions([ - (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), - (not sparse.issparse(self.coef_), - "self.coef_ is sparse. Sparse coefficients are not supported."), - (fptype is not None, "Unable to get dtype.")]) + f"sklearn.linear_model.LogisticRegression.{_function_name}" + ) + _patching_status.and_conditions( + [ + ( + self.multi_class in ["multinomial", "warn"], + f"{self.multi_class} multiclass option is not supported. " + "Only 'multinomial' or 'warn' options are supported.", + ), + (self.classes_.size == 2, "Number of classes != 2."), + ( + resultsToEvaluate == "computeClassLabels", + "resultsToEvaluate != 'computeClassLabels'.", + ), + ], + conditions_merging=any, + ) + _dal_ready = _patching_status.and_conditions( + [ + (not sparse.issparse(X), "X is sparse. Sparse input is not supported."), + ( + not sparse.issparse(self.coef_), + "self.coef_ is sparse. 
Sparse coefficients are not supported.", + ), + (fptype is not None, "Unable to get dtype."), + ] + ) _patching_status.write_log() if _dal_ready: n_features = self.coef_.shape[1] if X.shape[1] != n_features: raise ValueError( - f'X has {X.shape[1]} features, ' - f'but LogisticRegression is expecting {n_features} features as input' + f"X has {X.shape[1]} features, " + f"but LogisticRegression is expecting {n_features} features as input" ) builder = d4p.logistic_regression_model_builder(X.shape[1], len(self.classes_)) builder.set_beta(self.coef_, self.intercept_) predict = d4p.logistic_regression_prediction( nClasses=len(self.classes_), fptype=fptype, - method='defaultDense', - resultsToEvaluate=resultsToEvaluate + method="defaultDense", + resultsToEvaluate=resultsToEvaluate, ) res = predict.compute(X, builder.model) - if resultsToEvaluate == 'computeClassLabels': + if resultsToEvaluate == "computeClassLabels": res = res.prediction - if not np.array_equal(self.classes_, np.arange(0, len(self.classes_))) or \ - self.classes_.dtype != X.dtype: + if ( + not np.array_equal(self.classes_, np.arange(0, len(self.classes_))) + or self.classes_.dtype != X.dtype + ): res = self.classes_.take(np.asarray(res, dtype=np.intp)) - elif resultsToEvaluate == 'computeClassProbabilities': + elif resultsToEvaluate == "computeClassProbabilities": res = res.probabilities - elif resultsToEvaluate == 'computeClassLogProbabilities': + elif resultsToEvaluate == "computeClassLogProbabilities": res = res.logProbabilities else: - raise ValueError('resultsToEvaluate must be in [computeClassLabels, \ - computeClassProbabilities, computeClassLogProbabilities]') + raise ValueError( + "resultsToEvaluate must be in [computeClassLabels, \ + computeClassProbabilities, computeClassLogProbabilities]" + ) if res.shape[1] == 1: res = np.ravel(res) return res - if resultsToEvaluate == 'computeClassLabels': + if resultsToEvaluate == "computeClassLabels": return LogisticRegression_original.predict(self, X) - if resultsToEvaluate == 'computeClassProbabilities': + if resultsToEvaluate == "computeClassProbabilities": return LogisticRegression_original.predict_proba(self, X) - if resultsToEvaluate == 'computeClassLogProbabilities': + if resultsToEvaluate == "computeClassLogProbabilities": return LogisticRegression_original.predict_log_proba(self, X) -if sklearn_check_version('0.24'): +if sklearn_check_version("0.24"): + @support_usm_ndarray() def logistic_regression_path( X, @@ -801,13 +883,13 @@ def logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='auto', + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", random_state=None, check_input=True, max_squared_sum=None, @@ -815,14 +897,21 @@ def logistic_regression_path( l1_ratio=None, n_threads=1, ): - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, @@ -830,35 +919,42 @@ def logistic_regression_path( 
max_squared_sum=max_squared_sum, sample_weight=sample_weight, l1_ratio=l1_ratio, - n_threads=n_threads + n_threads=n_threads, ) return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, check_input=check_input, max_squared_sum=max_squared_sum, sample_weight=sample_weight, - l1_ratio=l1_ratio + l1_ratio=l1_ratio, ) class LogisticRegression(LogisticRegression_original): __doc__ = LogisticRegression_original.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **LogisticRegression_original._parameter_constraints } def __init__( self, - penalty='l2', + penalty="l2", dual=False, tol=1e-4, C=1.0, @@ -866,13 +962,13 @@ def __init__( intercept_scaling=1, class_weight=None, random_state=None, - solver='lbfgs', + solver="lbfgs", max_iter=100, - multi_class='auto', + multi_class="auto", verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None + l1_ratio=None, ): self.penalty = penalty self.dual = dual @@ -920,11 +1016,11 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - which, what = logistic_module, '_logistic_regression_path' + which, what = logistic_module, "_logistic_regression_path" replacer = logistic_regression_path descriptor = getattr(which, what, None) setattr(which, what, replacer) @@ -947,7 +1043,7 @@ def predict(self, X): C : array, shape [n_samples] Predicted class label per sample. """ - return daal4py_predict(self, X, 'computeClassLabels') + return daal4py_predict(self, X, "computeClassLabels") @support_usm_ndarray() def predict_log_proba(self, X): @@ -969,7 +1065,7 @@ def predict_log_proba(self, X): Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ - return daal4py_predict(self, X, 'computeClassLogProbabilities') + return daal4py_predict(self, X, "computeClassLogProbabilities") @support_usm_ndarray() def predict_proba(self, X): @@ -998,10 +1094,10 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. 
""" - return daal4py_predict(self, X, 'computeClassProbabilities') - + return daal4py_predict(self, X, "computeClassProbabilities") else: + @support_usm_ndarray() def logistic_regression_path( X, @@ -1012,13 +1108,13 @@ def logistic_regression_path( max_iter=100, tol=1e-4, verbose=0, - solver='lbfgs', + solver="lbfgs", coef=None, class_weight=None, dual=False, - penalty='l2', - intercept_scaling=1., - multi_class='auto', + penalty="l2", + intercept_scaling=1.0, + multi_class="auto", random_state=None, check_input=True, max_squared_sum=None, @@ -1026,19 +1122,26 @@ def logistic_regression_path( l1_ratio=None, ): return __logistic_regression_path( - X, y, pos_class=pos_class, - Cs=Cs, fit_intercept=fit_intercept, - max_iter=max_iter, tol=tol, verbose=verbose, - solver=solver, coef=coef, + X, + y, + pos_class=pos_class, + Cs=Cs, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + solver=solver, + coef=coef, class_weight=class_weight, - dual=dual, penalty=penalty, + dual=dual, + penalty=penalty, intercept_scaling=intercept_scaling, multi_class=multi_class, random_state=random_state, check_input=check_input, max_squared_sum=max_squared_sum, sample_weight=sample_weight, - l1_ratio=l1_ratio + l1_ratio=l1_ratio, ) class LogisticRegression(LogisticRegression_original): @@ -1046,7 +1149,7 @@ class LogisticRegression(LogisticRegression_original): def __init__( self, - penalty='l2', + penalty="l2", dual=False, tol=1e-4, C=1.0, @@ -1054,15 +1157,14 @@ def __init__( intercept_scaling=1, class_weight=None, random_state=None, - solver='lbfgs', + solver="lbfgs", max_iter=100, - multi_class='auto', + multi_class="auto", verbose=0, warm_start=False, n_jobs=None, l1_ratio=None, ): - self.penalty = penalty self.dual = dual self.tol = tol @@ -1109,7 +1211,7 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - which, what = logistic_module, '_logistic_regression_path' + which, what = logistic_module, "_logistic_regression_path" replacer = logistic_regression_path descriptor = getattr(which, what, None) setattr(which, what, replacer) @@ -1132,7 +1234,7 @@ def predict(self, X): C : array, shape [n_samples] Predicted class label per sample. """ - return daal4py_predict(self, X, 'computeClassLabels') + return daal4py_predict(self, X, "computeClassLabels") @support_usm_ndarray() def predict_log_proba(self, X): @@ -1154,7 +1256,7 @@ def predict_log_proba(self, X): Returns the log-probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. """ - return daal4py_predict(self, X, 'computeClassLogProbabilities') + return daal4py_predict(self, X, "computeClassLogProbabilities") @support_usm_ndarray() def predict_proba(self, X): @@ -1183,4 +1285,4 @@ def predict_proba(self, X): Returns the probability of the sample for each class in the model, where classes are ordered as they are in ``self.classes_``. 
""" - return daal4py_predict(self, X, 'computeClassProbabilities') + return daal4py_predict(self, X, "computeClassProbabilities") diff --git a/daal4py/sklearn/linear_model/ridge.py b/daal4py/sklearn/linear_model/ridge.py index 11df4b1aed..1d96dc5fa7 100644 --- a/daal4py/sklearn/linear_model/ridge.py +++ b/daal4py/sklearn/linear_model/ridge.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._ridge import * diff --git a/daal4py/sklearn/linear_model/tests/test_linear.py b/daal4py/sklearn/linear_model/tests/test_linear.py index 57e34e52ed..34ca62f37e 100644 --- a/daal4py/sklearn/linear_model/tests/test_linear.py +++ b/daal4py/sklearn/linear_model/tests/test_linear.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,18 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.linear_model import LinearRegression +import pytest from sklearn.datasets import make_regression +from sklearn.linear_model import LinearRegression +from sklearn.utils._testing import assert_array_almost_equal def make_dataset(n_samples, n_features, kind=np.array, random_state=0, types=None): try: from pandas import DataFrame + if kind not in (list, np.array, DataFrame): kind = np.array except ImportError: @@ -62,7 +63,7 @@ def make_dataset(n_samples, n_features, kind=np.array, random_state=0, types=Non def test_linear_array_vs_dataframe_homogen(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") x_train, y_train = make_dataset(100, 20) x_test, _ = make_dataset(100, 20, random_state=1) @@ -77,15 +78,17 @@ def test_linear_array_vs_dataframe_homogen(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.float32) @@ -102,15 +105,18 @@ def test_linear_array_vs_dataframe_heterogen(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - 
array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1)), decimal=5) + df_reg.predict(df_x_test).reshape((-1, 1)), + decimal=5, + ) def test_linear_array_vs_dataframe_heterogen_double_float(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.float32) @@ -127,15 +133,17 @@ def test_linear_array_vs_dataframe_heterogen_double_float(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen_double_int(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float64, np.int32) @@ -152,15 +160,17 @@ def test_linear_array_vs_dataframe_heterogen_double_int(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) def test_linear_array_vs_dataframe_heterogen_float_int(): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") types = (np.float32, np.int32) @@ -177,8 +187,10 @@ def test_linear_array_vs_dataframe_heterogen_float_int(): df_reg.fit(df_x_train, df_y_train) assert_array_almost_equal( - array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1))) + array_reg.coef_.reshape((-1, 1)), df_reg.coef_.reshape((-1, 1)) + ) assert_array_almost_equal(array_reg.intercept_, df_reg.intercept_) assert_array_almost_equal( array_reg.predict(x_test).reshape((-1, 1)), - df_reg.predict(df_x_test).reshape((-1, 1))) + df_reg.predict(df_x_test).reshape((-1, 1)), + ) diff --git a/daal4py/sklearn/manifold/__init__.py b/daal4py/sklearn/manifold/__init__.py index c2e3047cbf..9ec5be77fc 100644 --- a/daal4py/sklearn/manifold/__init__.py +++ b/daal4py/sklearn/manifold/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from ._t_sne import TSNE -__all__ = ['TSNE'] +__all__ = ["TSNE"] diff --git a/daal4py/sklearn/manifold/_t_sne.py b/daal4py/sklearn/manifold/_t_sne.py index bf349431c3..e3fa6f07c2 100755 --- a/daal4py/sklearn/manifold/_t_sne.py +++ b/daal4py/sklearn/manifold/_t_sne.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,33 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py TSNE scikit-learn-compatible class import warnings from time import time + import numpy as np from scipy.sparse import issparse -import daal4py -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, PatchingConditionsChain) - -from sklearn.manifold import TSNE as BaseTSNE from sklearn.decomposition import PCA +from sklearn.manifold import TSNE as BaseTSNE from sklearn.metrics.pairwise import pairwise_distances +from sklearn.utils import check_array, check_random_state from sklearn.utils.validation import check_non_negative -from sklearn.utils import check_random_state, check_array -from ..neighbors import NearestNeighbors +import daal4py +from daal4py.sklearn._utils import ( + PatchingConditionsChain, + daal_check_version, + sklearn_check_version, +) + from .._device_offload import support_usm_ndarray +from ..neighbors import NearestNeighbors -if sklearn_check_version('0.22'): - from sklearn.manifold._t_sne import _joint_probabilities - from sklearn.manifold._t_sne import _joint_probabilities_nn +if sklearn_check_version("0.22"): + from sklearn.manifold._t_sne import _joint_probabilities, _joint_probabilities_nn else: - from sklearn.manifold.t_sne import _joint_probabilities - from sklearn.manifold.t_sne import _joint_probabilities_nn + from sklearn.manifold.t_sne import _joint_probabilities, _joint_probabilities_nn class TSNE(BaseTSNE): @@ -101,39 +103,33 @@ def _daal_tsne(self, P, n_samples, X_embedded): # * final optimization with momentum at 0.8 # N, nnz, n_iter_without_progress, n_iter - size_iter = [[n_samples], [P.nnz], - [self.n_iter_without_progress], - [self.n_iter]] + size_iter = [[n_samples], [P.nnz], [self.n_iter_without_progress], [self.n_iter]] # Pass params to daal4py backend - if daal_check_version((2023, 'P', 1)): - size_iter.extend( - [[self._EXPLORATION_N_ITER], - [self._N_ITER_CHECK]] - ) + if daal_check_version((2023, "P", 1)): + size_iter.extend([[self._EXPLORATION_N_ITER], [self._N_ITER_CHECK]]) size_iter = np.array(size_iter, dtype=P.dtype) - params = np.array([[self.early_exaggeration], [self._learning_rate], - [self.min_grad_norm], [self.angle]], dtype=P.dtype) + params = np.array( + [ + [self.early_exaggeration], + [self._learning_rate], + [self.min_grad_norm], + [self.angle], + ], + dtype=P.dtype, + ) results = np.zeros((3, 1), dtype=P.dtype) # curIter, error, gradNorm if P.dtype == np.float64: daal4py.daal_tsne_gradient_descent( - X_embedded, - P, - size_iter, - 
params, - results, - 0) + X_embedded, P, size_iter, params, results, 0 + ) elif P.dtype == np.float32: daal4py.daal_tsne_gradient_descent( - X_embedded, - P, - size_iter, - params, - results, - 1) + X_embedded, P, size_iter, params, results, 1 + ) else: raise ValueError("unsupported dtype of 'P' matrix") @@ -147,40 +143,49 @@ def _daal_tsne(self, P, n_samples, X_embedded): def _fit(self, X, skip_num_points=0): """Private function to fit the model using X as training data.""" - if isinstance(self.init, str) and self.init == 'warn': - warnings.warn("The default initialization in TSNE will change " - "from 'random' to 'pca' in 1.2.", FutureWarning) - self._init = 'random' + if isinstance(self.init, str) and self.init == "warn": + warnings.warn( + "The default initialization in TSNE will change " + "from 'random' to 'pca' in 1.2.", + FutureWarning, + ) + self._init = "random" else: self._init = self.init - if isinstance(self._init, str) and self._init == 'pca' and issparse(X): - raise TypeError("PCA initialization is currently not suported " - "with the sparse input matrix. Use " - "init=\"random\" instead.") + if isinstance(self._init, str) and self._init == "pca" and issparse(X): + raise TypeError( + "PCA initialization is currently not suported " + "with the sparse input matrix. Use " + 'init="random" instead.' + ) - if self.method not in ['barnes_hut', 'exact']: + if self.method not in ["barnes_hut", "exact"]: raise ValueError("'method' must be 'barnes_hut' or 'exact'") if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") - if self.learning_rate == 'warn': - warnings.warn("The default learning rate in TSNE will change " - "from 200.0 to 'auto' in 1.2.", FutureWarning) + if self.learning_rate == "warn": + warnings.warn( + "The default learning rate in TSNE will change " + "from 200.0 to 'auto' in 1.2.", + FutureWarning, + ) self._learning_rate = 200.0 else: self._learning_rate = self.learning_rate - if self._learning_rate == 'auto': + if self._learning_rate == "auto": self._learning_rate = X.shape[0] / self.early_exaggeration / 4 self._learning_rate = np.maximum(self._learning_rate, 50) else: if not (self._learning_rate > 0): - raise ValueError("'learning_rate' must be a positive number " - "or 'auto'.") + raise ValueError( + "'learning_rate' must be a positive number " "or 'auto'." 
+ ) # rename attribute for compatibility with sklearn>=1.2 - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self.learning_rate_ = self._learning_rate - if hasattr(self, 'square_distances'): + if hasattr(self, "square_distances"): if sklearn_check_version("1.1"): if self.square_distances != "deprecated": warnings.warn( @@ -190,8 +195,7 @@ def _fit(self, X, skip_num_points=0): ) else: if self.square_distances not in [True, "legacy"]: - raise ValueError( - "'square_distances' must be True or 'legacy'.") + raise ValueError("'square_distances' must be True or 'legacy'.") if self.metric != "euclidean" and self.square_distances is not True: warnings.warn( "'square_distances' has been introduced in 0.24 to help phase " @@ -204,47 +208,67 @@ def _fit(self, X, skip_num_points=0): FutureWarning, ) - if self.method == 'barnes_hut': - if sklearn_check_version('0.23'): - X = self._validate_data(X, accept_sparse=['csr'], - ensure_min_samples=2, - dtype=[np.float32, np.float64]) + if self.method == "barnes_hut": + if sklearn_check_version("0.23"): + X = self._validate_data( + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) else: - X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = check_array( + X, + accept_sparse=["csr"], + ensure_min_samples=2, + dtype=[np.float32, np.float64], + ) else: - if sklearn_check_version('0.23'): - X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + if sklearn_check_version("0.23"): + X = self._validate_data( + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float32, np.float64] + ) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = check_array( + X, accept_sparse=["csr", "csc", "coo"], dtype=[np.float32, np.float64] + ) if self.metric == "precomputed": - if isinstance(self._init, str) and self._init == 'pca': - raise ValueError("The parameter init=\"pca\" cannot be " - "used with metric=\"precomputed\".") + if isinstance(self._init, str) and self._init == "pca": + raise ValueError( + 'The parameter init="pca" cannot be ' + 'used with metric="precomputed".' + ) if X.shape[0] != X.shape[1]: raise ValueError("X should be a square distance matrix") - check_non_negative(X, "TSNE.fit(). With metric='precomputed', X " - "should contain positive distances.") + check_non_negative( + X, + "TSNE.fit(). With metric='precomputed', X " + "should contain positive distances.", + ) if self.method == "exact" and issparse(X): raise TypeError( 'TSNE with method="exact" does not accept sparse ' 'precomputed distance matrix. Use method="barnes_hut" ' - 'or provide the dense distance matrix.') + "or provide the dense distance matrix." + ) - if self.method == 'barnes_hut' and self.n_components > 3: - raise ValueError("'n_components' should be inferior to 4 for the " - "barnes_hut algorithm as it relies on " - "quad-tree or oct-tree.") + if self.method == "barnes_hut" and self.n_components > 3: + raise ValueError( + "'n_components' should be inferior to 4 for the " + "barnes_hut algorithm as it relies on " + "quad-tree or oct-tree." 
+ ) random_state = check_random_state(self.random_state) if self.early_exaggeration < 1.0: - raise ValueError("early_exaggeration must be at least 1, but is {}" - .format(self.early_exaggeration)) + raise ValueError( + "early_exaggeration must be at least 1, but is {}".format( + self.early_exaggeration + ) + ) if self.n_iter < 250: raise ValueError("n_iter should be at least 250") @@ -267,78 +291,85 @@ def _fit(self, X, skip_num_points=0): # squared distances, and returns np.sqrt(dist) for # squared=False. # Also, Euclidean is slower for n_jobs>1, so don't set here - distances = pairwise_distances(X, metric=self.metric, - squared=True) + distances = pairwise_distances(X, metric=self.metric, squared=True) else: metric_params_ = {} - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): metric_params_ = self.metric_params or {} - distances = pairwise_distances(X, metric=self.metric, - n_jobs=self.n_jobs, - **metric_params_) + distances = pairwise_distances( + X, metric=self.metric, n_jobs=self.n_jobs, **metric_params_ + ) if np.any(distances < 0): - raise ValueError("All distances should be positive, the " - "metric given is not correct") + raise ValueError( + "All distances should be positive, the " "metric given is not correct" + ) - if self.metric != "euclidean" and \ - getattr(self, 'square_distances', True) is True: + if ( + self.metric != "euclidean" + and getattr(self, "square_distances", True) is True + ): distances **= 2 # compute the joint probability distribution for the input space P = _joint_probabilities(distances, self.perplexity, self.verbose) assert np.all(np.isfinite(P)), "All probabilities should be finite" assert np.all(P >= 0), "All probabilities should be non-negative" - assert np.all(P <= 1), ("All probabilities should be less " - "or then equal to one") + assert np.all(P <= 1), ( + "All probabilities should be less " "or then equal to one" + ) else: # Compute the number of nearest neighbors to find. # LvdM uses 3 * perplexity as the number of neighbors. # In the event that we have very small # of points # set the neighbors to n - 1. - n_neighbors = min(n_samples - 1, int(3. * self.perplexity + 1)) + n_neighbors = min(n_samples - 1, int(3.0 * self.perplexity + 1)) if self.verbose: - print("[t-SNE] Computing {} nearest neighbors..." 
- .format(n_neighbors)) + print("[t-SNE] Computing {} nearest neighbors...".format(n_neighbors)) # Find the nearest neighbors for every point knn = None if sklearn_check_version("1.1"): knn = NearestNeighbors( - algorithm='auto', + algorithm="auto", n_jobs=self.n_jobs, n_neighbors=n_neighbors, metric=self.metric, - metric_params=self.metric_params + metric_params=self.metric_params, ) else: knn = NearestNeighbors( - algorithm='auto', + algorithm="auto", n_jobs=self.n_jobs, n_neighbors=n_neighbors, - metric=self.metric + metric=self.metric, ) t0 = time() knn.fit(X) duration = time() - t0 if self.verbose: - print("[t-SNE] Indexed {} samples in {:.3f}s...".format( - n_samples, duration)) + print( + "[t-SNE] Indexed {} samples in {:.3f}s...".format(n_samples, duration) + ) t0 = time() - distances_nn = knn.kneighbors_graph(mode='distance') + distances_nn = knn.kneighbors_graph(mode="distance") duration = time() - t0 if self.verbose: - print("[t-SNE] Computed neighbors for {} samples " - "in {:.3f}s...".format(n_samples, duration)) + print( + "[t-SNE] Computed neighbors for {} samples " + "in {:.3f}s...".format(n_samples, duration) + ) # Free the memory used by the ball_tree del knn - if getattr(self, 'square_distances', True) is True or \ - self.metric == "euclidean": + if ( + getattr(self, "square_distances", True) is True + or self.metric == "euclidean" + ): # knn return the euclidean distance but we need it squared # to be consistent with the 'exact' method. Note that the # the method was derived using the euclidean method as in the @@ -347,30 +378,31 @@ def _fit(self, X, skip_num_points=0): distances_nn.data **= 2 # compute the joint probability distribution for the input space - P = _joint_probabilities_nn(distances_nn, self.perplexity, - self.verbose) + P = _joint_probabilities_nn(distances_nn, self.perplexity, self.verbose) if isinstance(self._init, np.ndarray): X_embedded = self._init - elif self._init == 'pca': + elif self._init == "pca": pca = PCA( n_components=self.n_components, - svd_solver='randomized', + svd_solver="randomized", random_state=random_state, ) X_embedded = pca.fit_transform(X).astype(np.float32, copy=False) - warnings.warn("The PCA initialization in TSNE will change to " - "have the standard deviation of PC1 equal to 1e-4 " - "in 1.2. This will ensure better convergence.", - FutureWarning) - elif self._init == 'random': + warnings.warn( + "The PCA initialization in TSNE will change to " + "have the standard deviation of PC1 equal to 1e-4 " + "in 1.2. This will ensure better convergence.", + FutureWarning, + ) + elif self._init == "random": # The embedding is initialized with iid samples from Gaussians with # standard deviation 1e-4. - X_embedded = 1e-4 * random_state.randn( - n_samples, self.n_components).astype(np.float32) + X_embedded = 1e-4 * random_state.randn(n_samples, self.n_components).astype( + np.float32 + ) else: - raise ValueError("'init' must be 'pca', 'random', or " - "a numpy array") + raise ValueError("'init' must be 'pca', 'random', or " "a numpy array") # Degrees of freedom of the Student's t-distribution. The suggestion # degrees_of_freedom = n_components - 1 comes from @@ -378,31 +410,31 @@ def _fit(self, X, skip_num_points=0): # Laurens van der Maaten, 2009. 
degrees_of_freedom = max(self.n_components - 1, 1) - _patching_status = PatchingConditionsChain( - "sklearn.manifold.TSNE._tsne") - _patching_status.and_conditions([ - (self.method == 'barnes_hut', - 'Used t-SNE method is not "barnes_hut" which is the only supported.'), - (self.n_components == 2, 'Number of components != 2.'), - (self.verbose == 0, 'Verbose mode is set.'), - (daal_check_version((2021, 'P', 600)), - 'oneDAL version is lower than 2021.6.') - ]) + _patching_status = PatchingConditionsChain("sklearn.manifold.TSNE._tsne") + _patching_status.and_conditions( + [ + ( + self.method == "barnes_hut", + 'Used t-SNE method is not "barnes_hut" which is the only supported.', + ), + (self.n_components == 2, "Number of components != 2."), + (self.verbose == 0, "Verbose mode is set."), + ( + daal_check_version((2021, "P", 600)), + "oneDAL version is lower than 2021.6.", + ), + ] + ) _dal_ready = _patching_status.get_status(logs=True) if _dal_ready: - X_embedded = check_array( - X_embedded, dtype=[np.float32, np.float64]) - return self._daal_tsne( - P, - n_samples, - X_embedded=X_embedded - ) + X_embedded = check_array(X_embedded, dtype=[np.float32, np.float64]) + return self._daal_tsne(P, n_samples, X_embedded=X_embedded) return self._tsne( P, degrees_of_freedom, n_samples, X_embedded=X_embedded, neighbors=neighbors_nn, - skip_num_points=skip_num_points + skip_num_points=skip_num_points, ) diff --git a/daal4py/sklearn/metrics/__init__.py b/daal4py/sklearn/metrics/__init__.py index 7695eb680c..3975869648 100644 --- a/daal4py/sklearn/metrics/__init__.py +++ b/daal4py/sklearn/metrics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from ._ranking import roc_auc_score from ._pairwise import pairwise_distances +from ._ranking import roc_auc_score -__all__ = ['roc_auc_score', 'pairwise_distances'] +__all__ = ["roc_auc_score", "pairwise_distances"] diff --git a/daal4py/sklearn/metrics/_pairwise.py b/daal4py/sklearn/metrics/_pairwise.py index a4222564e2..5db848fc9a 100755 --- a/daal4py/sklearn/metrics/_pairwise.py +++ b/daal4py/sklearn/metrics/_pairwise.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,54 +12,63 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np +import warnings from functools import partial -from sklearn.metrics.pairwise import _parallel_pairwise, _pairwise_callable -from sklearn.metrics.pairwise import _VALID_METRICS, PAIRWISE_DISTANCE_FUNCTIONS -from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS -from sklearn.metrics.pairwise import check_pairwise_arrays + +import numpy as np +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics.pairwise import ( + _VALID_METRICS, + PAIRWISE_BOOLEAN_FUNCTIONS, + PAIRWISE_DISTANCE_FUNCTIONS, + _pairwise_callable, + _parallel_pairwise, + check_pairwise_arrays, +) from sklearn.utils._joblib import effective_n_jobs from sklearn.utils.validation import check_non_negative -import warnings -from sklearn.exceptions import DataConversionWarning + try: from sklearn.metrics.pairwise import _precompute_metric_params except ImportError: + def _precompute_metric_params(*args, **kwrds): return dict() + from scipy.sparse import issparse from scipy.spatial import distance import daal4py from daal4py.sklearn.utils.validation import _daal_check_array -from .._utils import (getFPType, PatchingConditionsChain, sklearn_check_version) + from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version -if sklearn_check_version('1.3'): - from sklearn.utils._param_validation import ( - validate_params, Integral, StrOptions) +if sklearn_check_version("1.3"): + from sklearn.utils._param_validation import Integral, StrOptions, validate_params def _daal4py_cosine_distance_dense(X): X_fptype = getFPType(X) - alg = daal4py.cosine_distance(fptype=X_fptype, method='defaultDense') + alg = daal4py.cosine_distance(fptype=X_fptype, method="defaultDense") res = alg.compute(X) return res.cosineDistance def _daal4py_correlation_distance_dense(X): X_fptype = getFPType(X) - alg = daal4py.correlation_distance(fptype=X_fptype, method='defaultDense') + alg = daal4py.correlation_distance(fptype=X_fptype, method="defaultDense") res = alg.compute(X) return res.correlationDistance @support_usm_ndarray(freefunc=True) -def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, - force_all_finite=True, **kwds): +def pairwise_distances( + X, Y=None, metric="euclidean", n_jobs=None, force_all_finite=True, **kwds +): """ Compute the distance matrix from a vector array X and optional Y. This method takes either a vector array or a distance matrix, and returns @@ -159,46 +168,57 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, elements of two arrays """ if metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed": - raise ValueError("Unknown metric %s. Valid metrics are %s, or 'precomputed', " - "or a callable" % (metric, _VALID_METRICS)) - - X = _daal_check_array(X, accept_sparse=['csr', 'csc', 'coo'], - force_all_finite=force_all_finite) - - _patching_status = PatchingConditionsChain( - "sklearn.metrics.pairwise_distances") - _dal_ready = _patching_status.and_conditions([ - (metric == 'cosine' or metric == 'correlation', - f"'{metric}' metric is not supported. " - "Only 'cosine' and 'correlation' metrics are supported."), - (Y is None, "Second feature array is not supported."), - (not issparse(X), "X is sparse. Sparse input is not supported."), - (X.dtype == np.float64, - f"{X.dtype} X data type is not supported. 
Only np.float64 is supported.") - ]) + raise ValueError( + "Unknown metric %s. Valid metrics are %s, or 'precomputed', " + "or a callable" % (metric, _VALID_METRICS) + ) + + X = _daal_check_array( + X, accept_sparse=["csr", "csc", "coo"], force_all_finite=force_all_finite + ) + + _patching_status = PatchingConditionsChain("sklearn.metrics.pairwise_distances") + _dal_ready = _patching_status.and_conditions( + [ + ( + metric == "cosine" or metric == "correlation", + f"'{metric}' metric is not supported. " + "Only 'cosine' and 'correlation' metrics are supported.", + ), + (Y is None, "Second feature array is not supported."), + (not issparse(X), "X is sparse. Sparse input is not supported."), + ( + X.dtype == np.float64, + f"{X.dtype} X data type is not supported. Only np.float64 is supported.", + ), + ] + ) _patching_status.write_log() if _dal_ready: - if metric == 'cosine': + if metric == "cosine": return _daal4py_cosine_distance_dense(X) - if metric == 'correlation': + if metric == "correlation": return _daal4py_correlation_distance_dense(X) raise ValueError(f"'{metric}' distance is wrong for daal4py.") if metric == "precomputed": - X, _ = check_pairwise_arrays(X, Y, precomputed=True, - force_all_finite=force_all_finite) - whom = ("`pairwise_distances`. Precomputed distance " - " need to have non-negative values.") + X, _ = check_pairwise_arrays( + X, Y, precomputed=True, force_all_finite=force_all_finite + ) + whom = ( + "`pairwise_distances`. Precomputed distance " + " need to have non-negative values." + ) check_non_negative(X, whom=whom) return X if metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): - func = partial(_pairwise_callable, metric=metric, - force_all_finite=force_all_finite, **kwds) + func = partial( + _pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds + ) else: if issparse(X) or issparse(Y): - raise TypeError("scipy distance metrics do not" - " support sparse matrices.") + raise TypeError("scipy distance metrics do not" " support sparse matrices.") dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None @@ -206,22 +226,20 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, msg = "Data was converted to boolean for metric %s" % metric warnings.warn(msg, DataConversionWarning) - X, Y = check_pairwise_arrays(X, Y, dtype=dtype, - force_all_finite=force_all_finite) + X, Y = check_pairwise_arrays(X, Y, dtype=dtype, force_all_finite=force_all_finite) # precompute data-derived metric params params = _precompute_metric_params(X, Y, metric=metric, **kwds) kwds.update(**params) if effective_n_jobs(n_jobs) == 1 and X is Y: - return distance.squareform(distance.pdist(X, metric=metric, - **kwds)) + return distance.squareform(distance.pdist(X, metric=metric, **kwds)) func = partial(distance.cdist, metric=metric, **kwds) return _parallel_pairwise(X, Y, func, n_jobs, **kwds) -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): pairwise_distances = validate_params( { "X": ["array-like", "sparse matrix"], @@ -229,5 +247,6 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable], "n_jobs": [Integral, None], "force_all_finite": ["boolean", StrOptions({"allow-nan"})], - }, prefer_skip_nested_validation=True + }, + prefer_skip_nested_validation=True, )(pairwise_distances) diff --git a/daal4py/sklearn/metrics/_ranking.py b/daal4py/sklearn/metrics/_ranking.py index 8341dde30b..432e3d3568 100644 
--- a/daal4py/sklearn/metrics/_ranking.py +++ b/daal4py/sklearn/metrics/_ranking.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,37 +12,44 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import daal4py as d4p -import numpy as np -from functools import partial +import logging from collections.abc import Sequence -from scipy import sparse as sp +from functools import partial +import numpy as np +from scipy import sparse as sp +from sklearn.preprocessing import label_binarize from sklearn.utils import check_array from sklearn.utils.multiclass import is_multilabel -from sklearn.preprocessing import label_binarize -from ..utils.validation import _assert_all_finite -from .._utils import get_patch_message, sklearn_check_version, PatchingConditionsChain +import daal4py as d4p + from .._device_offload import support_usm_ndarray -import logging +from .._utils import PatchingConditionsChain, get_patch_message, sklearn_check_version +from ..utils.validation import _assert_all_finite -if sklearn_check_version('0.22'): - from sklearn.metrics._ranking import _multiclass_roc_auc_score as \ - multiclass_roc_auc_score - from sklearn.metrics._ranking import _binary_roc_auc_score +if sklearn_check_version("0.22"): from sklearn.metrics._base import _average_binary_score + from sklearn.metrics._ranking import _binary_roc_auc_score + from sklearn.metrics._ranking import ( + _multiclass_roc_auc_score as multiclass_roc_auc_score, + ) else: from sklearn.metrics.ranking import roc_auc_score as multiclass_roc_auc_score -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): from sklearn.utils._param_validation import ( - validate_params, Interval, Real, StrOptions) + Interval, + Real, + StrOptions, + validate_params, + ) try: import pandas as pd + pandas_is_imported = True except ImportError: pandas_is_imported = False @@ -50,47 +57,52 @@ def _daal_type_of_target(y): valid = ( - isinstance( - y, Sequence) or sp.isspmatrix(y) or hasattr( - y, '__array__')) and not isinstance( - y, str) + isinstance(y, Sequence) or sp.isspmatrix(y) or hasattr(y, "__array__") + ) and not isinstance(y, str) if not valid: - raise ValueError('Expected array-like (array or non-string sequence), ' - 'got %r' % y) + raise ValueError( + "Expected array-like (array or non-string sequence), " "got %r" % y + ) - sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] if sparse_pandas: raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") if is_multilabel(y): - return 'multilabel-indicator' + return "multilabel-indicator" try: y = np.asarray(y) except ValueError: # Known to fail in numpy 1.3 for array of arrays - return 'unknown' + return "unknown" # The old sequence of sequences format try: - if not hasattr(y[0], '__array__') and isinstance( - y[0], Sequence) and not isinstance(y[0], str): - raise ValueError('You appear to be using a legacy multi-label data' - ' representation. 
Sequence of sequences are no' - ' longer supported; use a binary array or sparse' - ' matrix instead - the MultiLabelBinarizer' - ' transformer can convert to this format.') + if ( + not hasattr(y[0], "__array__") + and isinstance(y[0], Sequence) + and not isinstance(y[0], str) + ): + raise ValueError( + "You appear to be using a legacy multi-label data" + " representation. Sequence of sequences are no" + " longer supported; use a binary array or sparse" + " matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." + ) except IndexError: pass # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len( - y) != 0 and not isinstance(y.flat[0], str)): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + if y.ndim > 2 or ( + y.dtype == object and len(y) != 0 and not isinstance(y.flat[0], str) + ): + return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: - return 'unknown' # [[]] + return "unknown" # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] @@ -98,20 +110,18 @@ def _daal_type_of_target(y): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values - if y.dtype.kind == 'f' and np.any(y != y.astype(int)): + if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _assert_all_finite(y) - return 'continuous' + suffix + return "continuous" + suffix - unique = np.sort( - pd.unique( - y.ravel())) if pandas_is_imported else np.unique(y) + unique = np.sort(pd.unique(y.ravel())) if pandas_is_imported else np.unique(y) if (len(unique) > 2) or (y.ndim >= 2 and len(y[0]) > 1): # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - result = ('multiclass' + suffix, None) + result = ("multiclass" + suffix, None) else: - result = ('binary', unique) # [1, 2] or [["a"], ["b"]] + result = ("binary", unique) # [1, 2] or [["a"], ["b"]] return result @@ -130,46 +140,57 @@ def roc_auc_score( y_true = check_array(y_true, ensure_2d=False, dtype=None) y_score = check_array(y_score, ensure_2d=False) - _patching_status = PatchingConditionsChain( - "sklearn.metrics.roc_auc_score") - _dal_ready = _patching_status.and_conditions([ - (y_type[0] == "binary" and not (y_score.ndim == 2 and y_score.shape[1] > 2), - "y_true type is not one-dimensional binary.") - ]) + _patching_status = PatchingConditionsChain("sklearn.metrics.roc_auc_score") + _dal_ready = _patching_status.and_conditions( + [ + ( + y_type[0] == "binary" + and not (y_score.ndim == 2 and y_score.shape[1] > 2), + "y_true type is not one-dimensional binary.", + ) + ] + ) _patching_status.write_log() if y_type[0] == "multiclass" or ( y_type[0] == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2 ): # do not support partial ROC computation for multiclass - if max_fpr is not None and max_fpr != 1.: - raise ValueError("Partial AUC computation not available in " - "multiclass setting, 'max_fpr' must be" - " set to `None`, received `max_fpr={0}` " - "instead".format(max_fpr)) - if multi_class == 'raise': + if max_fpr is not None and max_fpr != 1.0: + raise ValueError( + "Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr) + ) + if multi_class == "raise": raise ValueError("multi_class must be in ('ovo', 'ovr')") return multiclass_roc_auc_score( - y_true, y_score, labels, multi_class, average, sample_weight) + y_true, y_score, labels, multi_class, 
average, sample_weight + ) if y_type[0] == "binary": labels = y_type[1] - _dal_ready = _patching_status.and_conditions([ - (len(labels) == 2, "Number of unique labels is not equal to 2."), - (max_fpr is None, "Maximum false-positive rate is not supported."), - (sample_weight is None, "Sample weights are not supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + (len(labels) == 2, "Number of unique labels is not equal to 2."), + (max_fpr is None, "Maximum false-positive rate is not supported."), + (sample_weight is None, "Sample weights are not supported."), + ] + ) if _dal_ready: if not np.array_equal(labels, [0, 1]) or labels.dtype == bool: y_true = label_binarize(y_true, classes=labels)[:, 0] - if hasattr(y_score, 'dtype') and y_score.dtype == bool: + if hasattr(y_score, "dtype") and y_score.dtype == bool: y_score = label_binarize(y_score, classes=labels)[:, 0] - result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1), - y_score.reshape(-1, 1)) + result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1), y_score.reshape(-1, 1)) if result != -1: return result - logging.info("sklearn.metrics.roc_auc_score: " + get_patch_message( - "sklearn_after_daal")) + logging.info( + "sklearn.metrics.roc_auc_score: " + + get_patch_message("sklearn_after_daal") + ) # return to sklearn implementation y_true = label_binarize(y_true, classes=labels)[:, 0] @@ -182,7 +203,7 @@ def roc_auc_score( ) -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): roc_auc_score = validate_params( { "y_true": ["array-like"], @@ -192,5 +213,6 @@ def roc_auc_score( "max_fpr": [Interval(Real, 0.0, 1, closed="right"), None], "multi_class": [StrOptions({"raise", "ovr", "ovo"})], "labels": ["array-like", None], - }, prefer_skip_nested_validation=True + }, + prefer_skip_nested_validation=True, )(roc_auc_score) diff --git a/daal4py/sklearn/model_selection/__init__.py b/daal4py/sklearn/model_selection/__init__.py index 4fd1ce1bc3..f1d827da72 100644 --- a/daal4py/sklearn/model_selection/__init__.py +++ b/daal4py/sklearn/model_selection/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._split import train_test_split -__all__ = ['train_test_split'] +__all__ = ["train_test_split"] diff --git a/daal4py/sklearn/model_selection/_split.py b/daal4py/sklearn/model_selection/_split.py index b4e2c58107..f60f57af28 100644 --- a/daal4py/sklearn/model_selection/_split.py +++ b/daal4py/sklearn/model_selection/_split.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,16 +13,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== +import platform + +import numpy as np +from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit +from sklearn.model_selection._split import _validate_shuffle_split from sklearn.utils import indexable from sklearn.utils.validation import _num_samples -from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit -from sklearn.model_selection._split import _validate_shuffle_split + import daal4py as d4p -import numpy as np from daal4py.sklearn._utils import PatchingConditionsChain -import platform + from .._device_offload import support_usm_ndarray from .._utils import sklearn_check_version @@ -33,28 +36,30 @@ try: import mkl_random + mkl_random_is_imported = True except (ImportError, ModuleNotFoundError): mkl_random_is_imported = False try: import pandas as pd + pandas_is_imported = True except (ImportError, ModuleNotFoundError): pandas_is_imported = False -if sklearn_check_version('1.3'): +if sklearn_check_version("1.3"): import numbers - from sklearn.utils._param_validation import ( - validate_params, Interval, RealNotInt) + + from sklearn.utils._param_validation import Interval, RealNotInt, validate_params def get_dtypes(data): - if hasattr(data, 'dtype'): + if hasattr(data, "dtype"): return [data.dtype] - if hasattr(data, 'dtypes'): + if hasattr(data, "dtypes"): return list(data.dtypes) - if hasattr(data, 'values'): + if hasattr(data, "values"): return [data.values.dtype] return None @@ -64,20 +69,32 @@ def train_test_split(*arrays, **options): n_arrays = len(arrays) if n_arrays == 0: raise ValueError("At least one array required as input") - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - stratify = options.pop('stratify', None) - shuffle = options.pop('shuffle', True) - rng = options.pop('rng', 'OPTIMIZED_MT19937') - - available_rngs = ['default', 'MT19937', 'SFMT19937', 'MT2203', 'R250', - 'WH', 'MCG31', 'MCG59', 'MRG32K3A', 'PHILOX4X32X10', - 'NONDETERM', 'OPTIMIZED_MT19937'] + test_size = options.pop("test_size", None) + train_size = options.pop("train_size", None) + random_state = options.pop("random_state", None) + stratify = options.pop("stratify", None) + shuffle = options.pop("shuffle", True) + rng = options.pop("rng", "OPTIMIZED_MT19937") + + available_rngs = [ + "default", + "MT19937", + "SFMT19937", + "MT2203", + "R250", + "WH", + "MCG31", + "MCG59", + "MRG32K3A", + "PHILOX4X32X10", + "NONDETERM", + "OPTIMIZED_MT19937", + ] if rng not in available_rngs: raise ValueError( "Wrong random numbers generator is chosen. 
" - "Available generators: %s" % str(available_rngs)[1:-1]) + "Available generators: %s" % str(available_rngs)[1:-1] + ) if options: raise TypeError("Invalid parameters passed: %s" % str(options)) @@ -91,83 +108,99 @@ def train_test_split(*arrays, **options): if shuffle is False: if stratify is not None: raise ValueError( - "Stratified train/test split is not implemented for shuffle=False") + "Stratified train/test split is not implemented for shuffle=False" + ) train = np.arange(n_train) test = np.arange(n_train, n_train + n_test) else: if stratify is not None: cv = StratifiedShuffleSplit( - test_size=n_test, - train_size=n_train, - random_state=random_state + test_size=n_test, train_size=n_train, random_state=random_state ) train, test = next(cv.split(X=arrays[0], y=stratify)) else: - if mkl_random_is_imported and \ - rng not in ['default', 'OPTIMIZED_MT19937'] and \ - (isinstance(random_state, int) or random_state is None): + if ( + mkl_random_is_imported + and rng not in ["default", "OPTIMIZED_MT19937"] + and (isinstance(random_state, int) or random_state is None) + ): random_state = mkl_random.RandomState(random_state, rng) indexes = random_state.permutation(n_samples) - test, train = indexes[:n_test], indexes[n_test:( - n_test + n_train)] - elif rng == 'OPTIMIZED_MT19937' and \ - (isinstance(random_state, int) or random_state is None) and \ - platform.system() != 'Windows': + test, train = indexes[:n_test], indexes[n_test : (n_test + n_train)] + elif ( + rng == "OPTIMIZED_MT19937" + and (isinstance(random_state, int) or random_state is None) + and platform.system() != "Windows" + ): indexes = np.empty( shape=(n_samples,), - dtype=np.int64 if n_train + n_test > 2 ** 31 - 1 else np.int32 + dtype=np.int64 if n_train + n_test > 2**31 - 1 else np.int32, ) random_state = np.random.RandomState(random_state) random_state = random_state.get_state()[1] d4p.daal_generate_shuffled_indices([indexes], [random_state]) - test, train = indexes[:n_test], indexes[n_test:( - n_test + n_train)] + test, train = indexes[:n_test], indexes[n_test : (n_test + n_train)] else: cv = ShuffleSplit( - test_size=n_test, - train_size=n_train, - random_state=random_state + test_size=n_test, train_size=n_train, random_state=random_state ) train, test = next(cv.split(X=arrays[0], y=stratify)) res = [] for arr in arrays: _patching_status = PatchingConditionsChain( - "sklearn.model_selection.train_test_split") + "sklearn.model_selection.train_test_split" + ) # input format check - _patching_status.and_conditions([ - (isinstance(arr, np.ndarray), "The input is not a np.ndarray object.")]) + _patching_status.and_conditions( + [(isinstance(arr, np.ndarray), "The input is not a np.ndarray object.")] + ) if pandas_is_imported: - _patching_status.or_conditions([ - (isinstance(arr, pd.core.frame.DataFrame), - "The input is not a pd.DataFrame object."), - (isinstance(arr, pd.core.series.Series), - "The input is not a pd.Series object.") - ], conditions_merging=any) + _patching_status.or_conditions( + [ + ( + isinstance(arr, pd.core.frame.DataFrame), + "The input is not a pd.DataFrame object.", + ), + ( + isinstance(arr, pd.core.series.Series), + "The input is not a pd.Series object.", + ), + ], + conditions_merging=any, + ) # dimensions check - _dal_ready = _patching_status.and_conditions([ - (hasattr(arr, 'ndim'), "The input does not have 'ndim' attribute.")]) - if hasattr(arr, 'ndim'): - _patching_status.and_conditions([ - (arr.ndim <= 2, "The input has more than 2 dimensions.")]) + _dal_ready = _patching_status.and_conditions( 
+ [(hasattr(arr, "ndim"), "The input does not have 'ndim' attribute.")] + ) + if hasattr(arr, "ndim"): + _patching_status.and_conditions( + [(arr.ndim <= 2, "The input has more than 2 dimensions.")] + ) # data types check dtypes = get_dtypes(arr) - _dal_ready = _patching_status.and_conditions([ - (dtypes is not None, "Unable to parse input data types.")]) + _dal_ready = _patching_status.and_conditions( + [(dtypes is not None, "Unable to parse input data types.")] + ) if dtypes is not None: incorrect_dtype = None for i, dtype in enumerate(dtypes): - if 'float' not in str(dtype) and 'int' not in str(dtype): + if "float" not in str(dtype) and "int" not in str(dtype): incorrect_dtype = str(dtype) break - _dal_ready = _patching_status.and_conditions([ - (incorrect_dtype is None, - f"Input has incorrect data type '{incorrect_dtype}'. " - "Only integer and floating point types are supported.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + incorrect_dtype is None, + f"Input has incorrect data type '{incorrect_dtype}'. " + "Only integer and floating point types are supported.", + ) + ] + ) _patching_status.write_log() if not _dal_ready: @@ -185,10 +218,10 @@ def train_test_split(*arrays, **options): if not isinstance(arr_copy, list): arr_copy = arr_copy.reshape( (arr_copy.shape[0], n_cols), - order='A', + order="A", ) if isinstance(arr_copy, np.ndarray): - order = 'C' if arr_copy.flags['C_CONTIGUOUS'] else 'F' + order = "C" if arr_copy.flags["C_CONTIGUOUS"] else "F" train_arr = np.empty( shape=(n_train, n_cols), dtype=arr_copy.dtype, @@ -199,55 +232,56 @@ def train_test_split(*arrays, **options): dtype=arr_copy.dtype, order=order, ) - d4p.daal_train_test_split( - arr_copy, train_arr, test_arr, [train], [test] - ) + d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test]) if reshape_later: - train_arr, test_arr = train_arr.reshape( - (n_train,)), test_arr.reshape((n_test,)) + train_arr, test_arr = train_arr.reshape((n_train,)), test_arr.reshape( + (n_test,) + ) elif isinstance(arr_copy, list): train_arr = [ np.empty( shape=(n_train,), dtype=el.dtype, - order='C' if el.flags['C_CONTIGUOUS'] else 'F', - ) for el in arr_copy + order="C" if el.flags["C_CONTIGUOUS"] else "F", + ) + for el in arr_copy ] test_arr = [ np.empty( shape=(n_test,), dtype=el.dtype, - order='C' if el.flags['C_CONTIGUOUS'] else 'F' - ) for el in arr_copy + order="C" if el.flags["C_CONTIGUOUS"] else "F", + ) + for el in arr_copy ] - d4p.daal_train_test_split( - arr_copy, train_arr, test_arr, [train], [test]) - train_arr = {col: train_arr[i] - for i, col in enumerate(arr.columns)} - test_arr = {col: test_arr[i] - for i, col in enumerate(arr.columns)} + d4p.daal_train_test_split(arr_copy, train_arr, test_arr, [train], [test]) + train_arr = {col: train_arr[i] for i, col in enumerate(arr.columns)} + test_arr = {col: test_arr[i] for i, col in enumerate(arr.columns)} else: - raise ValueError('Array can\'t be converted to needed format') + raise ValueError("Array can't be converted to needed format") if pandas_is_imported: if isinstance(arr, pd.core.frame.DataFrame): - train_arr, test_arr = pd.DataFrame(train_arr, columns=arr.columns), \ - pd.DataFrame(test_arr, columns=arr.columns) + train_arr, test_arr = pd.DataFrame( + train_arr, columns=arr.columns + ), pd.DataFrame(test_arr, columns=arr.columns) if isinstance(arr, pd.core.series.Series): - train_arr, test_arr = \ - train_arr.reshape(n_train), test_arr.reshape(n_test) - train_arr, test_arr = pd.Series(train_arr, name=arr.name), \ - pd.Series(test_arr, 
name=arr.name) + train_arr, test_arr = train_arr.reshape(n_train), test_arr.reshape( + n_test + ) + train_arr, test_arr = pd.Series(train_arr, name=arr.name), pd.Series( + test_arr, name=arr.name + ) - if hasattr(arr, 'index'): + if hasattr(arr, "index"): train_arr.index = train test_arr.index = test - if hasattr(arr, 'columns'): + if hasattr(arr, "columns"): train_arr.columns = arr.columns test_arr.columns = arr.columns - if hasattr(arr, 'name'): + if hasattr(arr, "name"): train_arr.name = arr.name test_arr.name = arr.name @@ -257,19 +291,22 @@ def train_test_split(*arrays, **options): return res -if sklearn_check_version('1.3'): - train_test_split = validate_params({ - "test_size": [ - Interval(RealNotInt, 0, 1, closed="neither"), - Interval(numbers.Integral, 1, None, closed="left"), - None, - ], - "train_size": [ - Interval(RealNotInt, 0, 1, closed="neither"), - Interval(numbers.Integral, 1, None, closed="left"), - None, - ], - "random_state": ["random_state"], - "shuffle": ["boolean"], - "stratify": ["array-like", None], - }, prefer_skip_nested_validation=True)(train_test_split) +if sklearn_check_version("1.3"): + train_test_split = validate_params( + { + "test_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "train_size": [ + Interval(RealNotInt, 0, 1, closed="neither"), + Interval(numbers.Integral, 1, None, closed="left"), + None, + ], + "random_state": ["random_state"], + "shuffle": ["boolean"], + "stratify": ["array-like", None], + }, + prefer_skip_nested_validation=True, + )(train_test_split) diff --git a/daal4py/sklearn/model_selection/tests/test_split.py b/daal4py/sklearn/model_selection/tests/test_split.py index 8922ac4d0c..037188135d 100644 --- a/daal4py/sklearn/model_selection/tests/test_split.py +++ b/daal4py/sklearn/model_selection/tests/test_split.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,43 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np import pytest -from sklearn.model_selection import train_test_split as skl_train_test_split -from daal4py.sklearn.model_selection import train_test_split as d4p_train_test_split -from daal4py.sklearn._utils import daal_check_version from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split as skl_train_test_split +from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn.model_selection import train_test_split as d4p_train_test_split -N_SAMPLES = [2 ** i + 1 for i in range(2, 17)] +N_SAMPLES = [2**i + 1 for i in range(2, 17)] RANDOM_STATE = 777 @pytest.mark.skipif( - not daal_check_version((2021, 'P', 400)), - reason='train_test_split has bugfix since 2021.4 release') -@pytest.mark.parametrize('n_samples', N_SAMPLES) + not daal_check_version((2021, "P", 400)), + reason="train_test_split has bugfix since 2021.4 release", +) +@pytest.mark.parametrize("n_samples", N_SAMPLES) def test_results_similarity(n_samples): x, y = make_classification( - n_samples=n_samples, n_features=4, random_state=RANDOM_STATE) + n_samples=n_samples, n_features=4, random_state=RANDOM_STATE + ) d4p_res = d4p_train_test_split( x, y, test_size=n_samples // 2 - 1, train_size=n_samples // 2 - 1, - random_state=RANDOM_STATE) + random_state=RANDOM_STATE, + ) skl_res = skl_train_test_split( x, y, test_size=n_samples // 2 - 1, train_size=n_samples // 2 - 1, - random_state=RANDOM_STATE) + random_state=RANDOM_STATE, + ) - assert len(d4p_res) == len( - skl_res), 'train_test_splits have different output size' + assert len(d4p_res) == len(skl_res), "train_test_splits have different output size" for i, _ in enumerate(d4p_res): - assert np.all(d4p_res[i] == skl_res[i] - ), 'train_test_splits have different output' + assert np.all(d4p_res[i] == skl_res[i]), "train_test_splits have different output" diff --git a/daal4py/sklearn/monkeypatch/dispatcher.py b/daal4py/sklearn/monkeypatch/dispatcher.py index e8c3b02725..93745d66e8 100755 --- a/daal4py/sklearn/monkeypatch/dispatcher.py +++ b/daal4py/sklearn/monkeypatch/dispatcher.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,90 +12,120 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import set_idp_sklearn_verbose -from ..neighbors import KNeighborsRegressor as KNeighborsRegressor_daal4py -from ..neighbors import NearestNeighbors as NearestNeighbors_daal4py -from ..neighbors import KNeighborsClassifier as KNeighborsClassifier_daal4py -from ..model_selection import train_test_split -from ..utils.validation import _assert_all_finite -from ..svm.svm import SVC as SVC_daal4py -from ..ensemble._forest import RandomForestClassifier as RandomForestClassifier_daal4py -from ..ensemble._forest import RandomForestRegressor as RandomForestRegressor_daal4py -from ..metrics import roc_auc_score -from ..metrics import pairwise_distances -from ..cluster.k_means import KMeans as KMeans_daal4py -from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py -from ..linear_model.coordinate_descent import Lasso as Lasso_daal4py -from ..linear_model.coordinate_descent import ElasticNet as ElasticNet_daal4py -from ..linear_model.linear import LinearRegression as LinearRegression_daal4py -from ..linear_model.ridge import Ridge as Ridge_daal4py -from ..linear_model.logistic_path import LogisticRegression as LogisticRegression_daal4py -from ..linear_model.logistic_path import logistic_regression_path as \ - daal_optimized_logistic_path -from ..decomposition._pca import PCA as PCA_daal4py -from ..manifold import TSNE as TSNE_daal4py -from sklearn import model_selection -from sklearn import metrics -from sklearn.utils import validation import sys +import warnings from functools import lru_cache import sklearn.cluster as cluster_module -import sklearn.ensemble as ensemble_module -import sklearn.svm as svm_module -import sklearn.linear_model._logistic as logistic_module -import sklearn.neighbors as neighbors_module import sklearn.decomposition as decomposition_module +import sklearn.ensemble as ensemble_module import sklearn.linear_model as linear_model_module +import sklearn.linear_model._logistic as logistic_module import sklearn.manifold as manifold_module +import sklearn.neighbors as neighbors_module +import sklearn.svm as svm_module +from sklearn import metrics, model_selection +from sklearn.utils import validation -import warnings +from daal4py.sklearn._utils import set_idp_sklearn_verbose + +from ..cluster.dbscan import DBSCAN as DBSCAN_daal4py +from ..cluster.k_means import KMeans as KMeans_daal4py +from ..decomposition._pca import PCA as PCA_daal4py +from ..ensemble._forest import RandomForestClassifier as RandomForestClassifier_daal4py +from ..ensemble._forest import RandomForestRegressor as RandomForestRegressor_daal4py +from ..linear_model.coordinate_descent import ElasticNet as ElasticNet_daal4py +from ..linear_model.coordinate_descent import Lasso as Lasso_daal4py +from ..linear_model.linear import LinearRegression as LinearRegression_daal4py +from ..linear_model.logistic_path import LogisticRegression as LogisticRegression_daal4py +from ..linear_model.logistic_path import ( + logistic_regression_path as daal_optimized_logistic_path, +) +from ..linear_model.ridge import Ridge as Ridge_daal4py +from ..manifold import TSNE as TSNE_daal4py +from ..metrics import pairwise_distances, roc_auc_score +from ..model_selection import train_test_split +from ..neighbors import KNeighborsClassifier as KNeighborsClassifier_daal4py +from ..neighbors import KNeighborsRegressor as KNeighborsRegressor_daal4py +from 
..neighbors import NearestNeighbors as NearestNeighbors_daal4py +from ..svm.svm import SVC as SVC_daal4py +from ..utils.validation import _assert_all_finite @lru_cache(maxsize=None) def _get_map_of_algorithms(): mapping = { - 'pca': [[(decomposition_module, 'PCA', PCA_daal4py), None]], - 'kmeans': [[(cluster_module, 'KMeans', KMeans_daal4py), None]], - 'dbscan': [[(cluster_module, 'DBSCAN', DBSCAN_daal4py), None]], - 'distances': [[(metrics, 'pairwise_distances', pairwise_distances), None]], - 'linear': [[(linear_model_module, 'LinearRegression', - LinearRegression_daal4py), None]], - 'ridge': [[(linear_model_module, 'Ridge', Ridge_daal4py), None]], - 'elasticnet': [[(linear_model_module, 'ElasticNet', ElasticNet_daal4py), None]], - 'lasso': [[(linear_model_module, 'Lasso', Lasso_daal4py), None]], - 'svm': [[(svm_module, 'SVC', SVC_daal4py), None]], - 'logistic': [[(logistic_module, '_logistic_regression_path', - daal_optimized_logistic_path), None]], - 'log_reg': [[(linear_model_module, 'LogisticRegression', - LogisticRegression_daal4py), None]], - 'knn_classifier': [[(neighbors_module, 'KNeighborsClassifier', - KNeighborsClassifier_daal4py), None]], - 'nearest_neighbors': [[(neighbors_module, 'NearestNeighbors', - NearestNeighbors_daal4py), None]], - 'knn_regressor': [[(neighbors_module, 'KNeighborsRegressor', - KNeighborsRegressor_daal4py), None]], - 'random_forest_classifier': [[(ensemble_module, 'RandomForestClassifier', - RandomForestClassifier_daal4py), None]], - 'random_forest_regressor': [[(ensemble_module, 'RandomForestRegressor', - RandomForestRegressor_daal4py), None]], - 'train_test_split': [[(model_selection, 'train_test_split', - train_test_split), None]], - 'fin_check': [[(validation, '_assert_all_finite', - _assert_all_finite), None]], - 'roc_auc_score': [[(metrics, 'roc_auc_score', - roc_auc_score), None]], - 'tsne': [[(manifold_module, 'TSNE', TSNE_daal4py), None]], + "pca": [[(decomposition_module, "PCA", PCA_daal4py), None]], + "kmeans": [[(cluster_module, "KMeans", KMeans_daal4py), None]], + "dbscan": [[(cluster_module, "DBSCAN", DBSCAN_daal4py), None]], + "distances": [[(metrics, "pairwise_distances", pairwise_distances), None]], + "linear": [ + [(linear_model_module, "LinearRegression", LinearRegression_daal4py), None] + ], + "ridge": [[(linear_model_module, "Ridge", Ridge_daal4py), None]], + "elasticnet": [[(linear_model_module, "ElasticNet", ElasticNet_daal4py), None]], + "lasso": [[(linear_model_module, "Lasso", Lasso_daal4py), None]], + "svm": [[(svm_module, "SVC", SVC_daal4py), None]], + "logistic": [ + [ + ( + logistic_module, + "_logistic_regression_path", + daal_optimized_logistic_path, + ), + None, + ] + ], + "log_reg": [ + [ + (linear_model_module, "LogisticRegression", LogisticRegression_daal4py), + None, + ] + ], + "knn_classifier": [ + [ + (neighbors_module, "KNeighborsClassifier", KNeighborsClassifier_daal4py), + None, + ] + ], + "nearest_neighbors": [ + [(neighbors_module, "NearestNeighbors", NearestNeighbors_daal4py), None] + ], + "knn_regressor": [ + [(neighbors_module, "KNeighborsRegressor", KNeighborsRegressor_daal4py), None] + ], + "random_forest_classifier": [ + [ + ( + ensemble_module, + "RandomForestClassifier", + RandomForestClassifier_daal4py, + ), + None, + ] + ], + "random_forest_regressor": [ + [ + (ensemble_module, "RandomForestRegressor", RandomForestRegressor_daal4py), + None, + ] + ], + "train_test_split": [ + [(model_selection, "train_test_split", train_test_split), None] + ], + "fin_check": [[(validation, "_assert_all_finite", 
_assert_all_finite), None]], + "roc_auc_score": [[(metrics, "roc_auc_score", roc_auc_score), None]], + "tsne": [[(manifold_module, "TSNE", TSNE_daal4py), None]], } - mapping['svc'] = mapping['svm'] - mapping['logisticregression'] = mapping['log_reg'] - mapping['kneighborsclassifier'] = mapping['knn_classifier'] - mapping['nearestneighbors'] = mapping['nearest_neighbors'] - mapping['kneighborsregressor'] = mapping['knn_regressor'] - mapping['randomforestclassifier'] = mapping['random_forest_classifier'] - mapping['randomforestregressor'] = mapping['random_forest_regressor'] + mapping["svc"] = mapping["svm"] + mapping["logisticregression"] = mapping["log_reg"] + mapping["kneighborsclassifier"] = mapping["knn_classifier"] + mapping["nearestneighbors"] = mapping["nearest_neighbors"] + mapping["kneighborsregressor"] = mapping["knn_regressor"] + mapping["randomforestclassifier"] = mapping["random_forest_classifier"] + mapping["randomforestregressor"] = mapping["random_forest_regressor"] mapping["linearregression"] = mapping["linear"] mapping["logisticregression"] = mapping["log_reg"] mapping["_logistic_regression_path"] = mapping["logistic"] @@ -135,23 +165,28 @@ def enable(name=None, verbose=True, deprecation=True, get_map=_get_map_of_algori do_patch(key, get_map) if deprecation: set_idp_sklearn_verbose() - warnings.warn_explicit("\nScikit-learn patching with daal4py is deprecated " - "and will be removed in the future.\n" - "Use Intel(R) Extension " - "for Scikit-learn* module instead " - "(pip install scikit-learn-intelex).\n" - "To enable patching, please use one of the " - "following options:\n" - "1) From the command line:\n" - " python -m sklearnex \n" - "2) From your script:\n" - " from sklearnex import patch_sklearn\n" - " patch_sklearn()", - FutureWarning, "dispatcher.py", 151) + warnings.warn_explicit( + "\nScikit-learn patching with daal4py is deprecated " + "and will be removed in the future.\n" + "Use Intel(R) Extension " + "for Scikit-learn* module instead " + "(pip install scikit-learn-intelex).\n" + "To enable patching, please use one of the " + "following options:\n" + "1) From the command line:\n" + " python -m sklearnex \n" + "2) From your script:\n" + " from sklearnex import patch_sklearn\n" + " patch_sklearn()", + FutureWarning, + "dispatcher.py", + 151, + ) if verbose and deprecation and sys.stderr is not None: sys.stderr.write( "Intel(R) oneAPI Data Analytics Library solvers for sklearn enabled: " - "https://intelpython.github.io/daal4py/sklearn.html\n") + "https://intelpython.github.io/daal4py/sklearn.html\n" + ) def disable(name=None, get_map=_get_map_of_algorithms): diff --git a/daal4py/sklearn/monkeypatch/tests/_models_info.py b/daal4py/sklearn/monkeypatch/tests/_models_info.py index aa1f3b6f81..47dbf138e0 100644 --- a/daal4py/sklearn/monkeypatch/tests/_models_info.py +++ b/daal4py/sklearn/monkeypatch/tests/_models_info.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,105 +12,113 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np - -from sklearn.svm import SVC -from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor) -from sklearn.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - NearestNeighbors) +from sklearn.cluster import DBSCAN, KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, LogisticRegression, LogisticRegressionCV, - LinearRegression, Ridge, - ElasticNet, - Lasso) -from sklearn.cluster import (KMeans, DBSCAN) +) from sklearn.manifold import TSNE -from sklearn.decomposition import PCA +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors +from sklearn.svm import SVC + from daal4py.sklearn._utils import daal_check_version MODELS_INFO = [ { - 'model': KNeighborsClassifier(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": KNeighborsClassifier(algorithm="brute"), + "methods": ["kneighbors", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'score'], - 'dataset': 'regression', + "model": KNeighborsRegressor(algorithm="brute"), + "methods": ["kneighbors", "predict", "score"], + "dataset": "regression", }, { - 'model': NearestNeighbors(algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': DBSCAN(), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": DBSCAN(), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': SVC(probability=True), - 'methods': ['decision_function', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": SVC(probability=True), + "methods": ["decision_function", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KMeans(), - 'methods': ['fit_predict', 'fit_transform', 'transform', 'predict', 'score'], - 'dataset': 'blobs', + "model": KMeans(), + "methods": ["fit_predict", "fit_transform", "transform", "predict", "score"], + "dataset": "blobs", }, { - 'model': ElasticNet(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": ElasticNet(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Lasso(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Lasso(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': PCA(), - 'methods': ['fit_transform', 'transform', 'score'], - 'dataset': 'classifier', + "model": PCA(), + "methods": ["fit_transform", "transform", "score"], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(n_estimators=10), - 'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": RandomForestClassifier(n_estimators=10), + "methods": ["predict", "predict_proba", "predict_log_proba", "score"], + "dataset": "classifier", }, { - 'model': LogisticRegression(max_iter=100, multi_class='multinomial'), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": 
LogisticRegression(max_iter=100, multi_class="multinomial"), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(max_iter=100), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegressionCV(max_iter=100), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(n_estimators=10), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": RandomForestRegressor(n_estimators=10), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Ridge(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Ridge(), + "methods": ["predict", "score"], + "dataset": "regression", }, ] @@ -130,21 +138,22 @@ TO_SKIP = [ # --------------- NO INFO --------------- - r'KMeans .*transform', - r'KMeans .*score', - r'PCA .*score', - r'LogisticRegression .*decision_function', - r'LogisticRegressionCV .*decision_function', - r'LogisticRegressionCV .*predict', - r'LogisticRegressionCV .*predict_proba', - r'LogisticRegressionCV .*predict_log_proba', - r'LogisticRegressionCV .*score', + r"KMeans .*transform", + r"KMeans .*score", + r"PCA .*score", + r"LogisticRegression .*decision_function", + r"LogisticRegressionCV .*decision_function", + r"LogisticRegressionCV .*predict", + r"LogisticRegressionCV .*predict_proba", + r"LogisticRegressionCV .*predict_log_proba", + r"LogisticRegressionCV .*score", # --------------- Scikit --------------- - r'Ridge float16 predict', - r'Ridge float16 score', - r'RandomForestClassifier .*predict_proba', - r'RandomForestClassifier .*predict_log_proba', - r'pairwise_distances .*pairwise_distances', # except float64 - r'roc_auc_score .*roc_auc_score' \ - if not daal_check_version((2021, 'P', 200)) else None, + r"Ridge float16 predict", + r"Ridge float16 score", + r"RandomForestClassifier .*predict_proba", + r"RandomForestClassifier .*predict_log_proba", + r"pairwise_distances .*pairwise_distances", # except float64 + r"roc_auc_score .*roc_auc_score" + if not daal_check_version((2021, "P", 200)) + else None, ] diff --git a/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py b/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py index a9f7ca8de4..d11675eb8d 100755 --- a/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py +++ b/daal4py/sklearn/monkeypatch/tests/test_monkeypatch.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import daal4py.sklearn @@ -25,8 +25,9 @@ def test_monkey_patching(): _classes.append(v[0][0]) assert len(_tokens) == len(_classes) - assert isinstance(_tokens, list) and len(_tokens) > 0, \ - "Internal Error: list of patched names has unexcepable format." + assert ( + isinstance(_tokens, list) and len(_tokens) > 0 + ), "Internal Error: list of patched names has unacceptable format." daal4py.sklearn.patch_sklearn() @@ -36,8 +37,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('daal4py'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py"), "Patching has completed with error." for i, _ in enumerate(_tokens): t = _tokens[i] @@ -46,8 +46,7 @@ def test_monkey_patching(): daal4py.sklearn.unpatch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." daal4py.sklearn.unpatch_sklearn() @@ -57,8 +56,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." for i, _ in enumerate(_tokens): t = _tokens[i] @@ -68,7 +66,6 @@ def test_monkey_patching(): daal4py.sklearn.patch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('daal4py'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py"), "Patching has completed with error." daal4py.sklearn.unpatch_sklearn() diff --git a/daal4py/sklearn/monkeypatch/tests/test_patching.py b/daal4py/sklearn/monkeypatch/tests/test_patching.py index 23d5117007..9a5657d752 100644 --- a/daal4py/sklearn/monkeypatch/tests/test_patching.py +++ b/daal4py/sklearn/monkeypatch/tests/test_patching.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,27 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-#=============================================================================== +# =============================================================================== +import os +import pathlib import re import subprocess import sys -import os -import pathlib + import pytest from _models_info import TO_SKIP def get_branch(s): if len(s) == 0: - return 'NO INFO' + return "NO INFO" for i in s: - if 'failed to run accelerated version, fallback to original Scikit-learn' in i: - return 'was in OPT, but go in Scikit' + if "failed to run accelerated version, fallback to original Scikit-learn" in i: + return "was in OPT, but go in Scikit" for i in s: - if 'running accelerated version' in i: - return 'OPT' - return 'Scikit' + if "running accelerated version" in i: + return "OPT" + return "Scikit" def run_parse(mas, result): @@ -41,8 +42,8 @@ def run_parse(mas, result): INFO_POS = 6 for i in range(1, len(mas)): mas[i] = mas[i][INFO_POS:] # remove 'INFO: ' - if not mas[i].startswith('sklearn'): - ind = name + ' ' + dtype + ' ' + mas[i] + if not mas[i].startswith("sklearn"): + ind = name + " " + dtype + " " + mas[i] result[ind] = get_branch(temp) temp.clear() else: @@ -50,14 +51,11 @@ def run_parse(mas, result): def get_result_log(): - os.environ['IDP_SKLEARN_VERBOSE'] = 'INFO' + os.environ["IDP_SKLEARN_VERBOSE"] = "INFO" absolute_path = str(pathlib.Path(__file__).parent.absolute()) try: process = subprocess.check_output( - [ - sys.executable, - absolute_path + '/utils/_launch_algorithms.py' - ] + [sys.executable, absolute_path + "/utils/_launch_algorithms.py"] ) except subprocess.CalledProcessError as e: print(e) @@ -65,25 +63,25 @@ def get_result_log(): mas = [] result = {} - for i in process.decode().split('\n'): - if not i.startswith('INFO') and len(mas) != 0: + for i in process.decode().split("\n"): + if not i.startswith("INFO") and len(mas) != 0: run_parse(mas, result) mas.clear() mas.append(i.strip()) else: mas.append(i.strip()) - del os.environ['IDP_SKLEARN_VERBOSE'] + del os.environ["IDP_SKLEARN_VERBOSE"] return result result_log = get_result_log() -@pytest.mark.parametrize('configuration', result_log) +@pytest.mark.parametrize("configuration", result_log) def test_patching(configuration): - if 'OPT' in result_log[configuration]: + if "OPT" in result_log[configuration]: return for skip in TO_SKIP: if re.search(skip, configuration) is not None: pytest.skip("SKIPPED", allow_module_level=False) - raise ValueError('Test patching failed: ' + configuration) + raise ValueError("Test patching failed: " + configuration) diff --git a/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py b/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py index b1232f1278..ed867b19d9 100644 --- a/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py +++ b/daal4py/sklearn/monkeypatch/tests/utils/_launch_algorithms.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np import logging import random +import numpy as np + from daal4py.sklearn import patch_sklearn + patch_sklearn() +import pathlib +import sys + +from sklearn.datasets import load_diabetes, load_iris, make_regression from sklearn.metrics import pairwise_distances, roc_auc_score -from sklearn.datasets import ( - make_regression, - load_iris, - load_diabetes) -import sys -import pathlib absolute_path = str(pathlib.Path(__file__).parent.absolute()) -sys.path.append(absolute_path + '/../') +sys.path.append(absolute_path + "/../") from _models_info import MODELS_INFO, TYPES @@ -39,80 +39,80 @@ def get_class_name(x): def generate_dataset(name, dtype, model_name): - if model_name == 'LinearRegression': + if model_name == "LinearRegression": X, y = make_regression(n_samples=1000, n_features=5) - elif name in ['blobs', 'classifier']: + elif name in ["blobs", "classifier"]: X, y = load_iris(return_X_y=True) - elif name == 'regression': + elif name == "regression": X, y = load_diabetes(return_X_y=True) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") X = np.array(X, dtype=dtype) y = np.array(y, dtype=dtype) return (X, y) def run_patch(model_info, dtype): - print(get_class_name(model_info['model']), dtype.__name__) - X, y = generate_dataset(model_info['dataset'], - dtype, - get_class_name(model_info['model'])) - model = model_info['model'] + print(get_class_name(model_info["model"]), dtype.__name__) + X, y = generate_dataset( + model_info["dataset"], dtype, get_class_name(model_info["model"]) + ) + model = model_info["model"] model.fit(X, y) - logging.info('fit') - for i in model_info['methods']: - if i == 'predict': + logging.info("fit") + for i in model_info["methods"]: + if i == "predict": model.predict(X) - elif i == 'predict_proba': + elif i == "predict_proba": model.predict_proba(X) - elif i == 'predict_log_proba': + elif i == "predict_log_proba": model.predict_log_proba(X) - elif i == 'decision_function': + elif i == "decision_function": model.decision_function(X) - elif i == 'fit_predict': + elif i == "fit_predict": model.fit_predict(X) - elif i == 'transform': + elif i == "transform": model.transform(X) - elif i == 'fit_transform': + elif i == "fit_transform": model.fit_transform(X) - elif i == 'kneighbors': + elif i == "kneighbors": model.kneighbors(X) - elif i == 'score': + elif i == "score": model.score(X, y) else: - raise ValueError(i + ' is wrong method') + raise ValueError(i + " is wrong method") logging.info(i) def run_algotithms(): for info in MODELS_INFO: for t in TYPES: - model_name = get_class_name(info['model']) - if model_name in ['Ridge', 'LinearRegression'] and t.__name__ == 'uint32': + model_name = get_class_name(info["model"]) + if model_name in ["Ridge", "LinearRegression"] and t.__name__ == "uint32": continue run_patch(info, t) def run_utils(): # pairwise_distances - for metric in ['cosine', 'correlation']: + for metric in ["cosine", "correlation"]: for t in TYPES: X = np.random.rand(1000) X = np.array(X, dtype=t) - print('pairwise_distances', t.__name__) + print("pairwise_distances", t.__name__) _ = pairwise_distances(X.reshape(1, -1), metric=metric) - logging.info('pairwise_distances') + logging.info("pairwise_distances") # roc_auc_score for t in [np.float32, np.float64]: a = [random.randint(0, 1) for i in range(1000)] b = [random.randint(0, 1) for i 
in range(1000)] a = np.array(a, dtype=t) b = np.array(b, dtype=t) - print('roc_auc_score', t.__name__) + print("roc_auc_score", t.__name__) _ = roc_auc_score(a, b) - logging.info('roc_auc_score') + logging.info("roc_auc_score") -if __name__ == '__main__': +if __name__ == "__main__": run_algotithms() run_utils() diff --git a/daal4py/sklearn/neighbors/__init__.py b/daal4py/sklearn/neighbors/__init__.py index 0aecdf94d6..901f7ebce1 100755 --- a/daal4py/sklearn/neighbors/__init__.py +++ b/daal4py/sklearn/neighbors/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from ._classification import KNeighborsClassifier -from ._unsupervised import NearestNeighbors from ._regression import KNeighborsRegressor +from ._unsupervised import NearestNeighbors -__all__ = ['KNeighborsClassifier', 'NearestNeighbors', 'KNeighborsRegressor'] +__all__ = ["KNeighborsClassifier", "NearestNeighbors", "KNeighborsRegressor"] diff --git a/daal4py/sklearn/neighbors/_base.py b/daal4py/sklearn/neighbors/_base.py index 4ed877a928..9bc3491fd6 100644 --- a/daal4py/sklearn/neighbors/_base.py +++ b/daal4py/sklearn/neighbors/_base.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,69 +12,75 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN scikit-learn-compatible base classes -import numpy as np -import numbers -import daal4py as d4p -from scipy import sparse as sp -from .._utils import ( - getFPType, - sklearn_check_version, - get_patch_message, - PatchingConditionsChain) -from sklearn.utils.validation import check_array, check_is_fitted, check_X_y -from sklearn.utils.multiclass import check_classification_targets -from sklearn.base import is_classifier, is_regressor import logging +import numbers import warnings +import numpy as np +from scipy import sparse as sp +from sklearn.base import is_classifier, is_regressor +from sklearn.neighbors import VALID_METRICS +from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._base import KNeighborsMixin as BaseKNeighborsMixin -from sklearn.neighbors._base import RadiusNeighborsMixin as BaseRadiusNeighborsMixin from sklearn.neighbors._base import NeighborsBase as BaseNeighborsBase -from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import RadiusNeighborsMixin as BaseRadiusNeighborsMixin from sklearn.neighbors._kd_tree import KDTree -from sklearn.neighbors import VALID_METRICS +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_is_fitted, check_X_y + +import daal4py as d4p + +from .._utils import ( + PatchingConditionsChain, + get_patch_message, + getFPType, + sklearn_check_version, +) + if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights def training_algorithm(method, fptype, params): - if method == 'brute': + if method == "brute": train_alg = d4p.bf_knn_classification_training else: train_alg = d4p.kdtree_knn_classification_training - params['fptype'] = fptype + params["fptype"] = fptype return train_alg(**params) def prediction_algorithm(method, fptype, params): - if method == 'brute': + if method == "brute": predict_alg = d4p.bf_knn_classification_prediction else: predict_alg = d4p.kdtree_knn_classification_prediction - params['fptype'] = fptype + params["fptype"] = fptype return predict_alg(**params) def parse_auto_method(estimator, method, n_samples, n_features): result_method = method - if (method in ['auto', 'ball_tree']): - condition = estimator.n_neighbors is not None and \ - estimator.n_neighbors >= estimator.n_samples_fit_ // 2 - if estimator.metric == 'precomputed' or n_features > 11 or condition: - result_method = 'brute' + if method in ["auto", "ball_tree"]: + condition = ( + estimator.n_neighbors is not None + and estimator.n_neighbors >= estimator.n_samples_fit_ // 2 + ) + if estimator.metric == "precomputed" or n_features > 11 or condition: + result_method = "brute" else: - if estimator.effective_metric_ in VALID_METRICS['kd_tree']: - result_method = 'kd_tree' + if estimator.effective_metric_ in VALID_METRICS["kd_tree"]: + result_method = "kd_tree" else: - result_method = 'brute' + result_method = "brute" return result_method @@ -82,41 +88,45 @@ def parse_auto_method(estimator, method, n_samples, n_features): def daal4py_fit(estimator, X, fptype): estimator._fit_X = X estimator._fit_method = estimator.algorithm - estimator.effective_metric_ = 'euclidean' + estimator.effective_metric_ = "euclidean" estimator._tree = None - weights = getattr(estimator, 'weights', 'uniform') + weights = getattr(estimator, "weights", "uniform") params = { - 
'method': 'defaultDense', - 'k': estimator.n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' if getattr(estimator, '_y', None) is None - else 'computeClassLabels' + "method": "defaultDense", + "k": estimator.n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(estimator, "_y", None) is None + else "computeClassLabels", } - if hasattr(estimator, 'classes_'): - params['nClasses'] = len(estimator.classes_) + if hasattr(estimator, "classes_"): + params["nClasses"] = len(estimator.classes_) - if getattr(estimator, '_y', None) is None: + if getattr(estimator, "_y", None) is None: labels = None else: labels = estimator._y.reshape(-1, 1) method = parse_auto_method( - estimator, estimator.algorithm, - estimator.n_samples_fit_, estimator.n_features_in_) + estimator, estimator.algorithm, estimator.n_samples_fit_, estimator.n_features_in_ + ) estimator._fit_method = method train_alg = training_algorithm(method, fptype, params) estimator._daal_model = train_alg.compute(X, labels).model -def daal4py_kneighbors(estimator, X=None, n_neighbors=None, - return_distance=True): - n_features = getattr(estimator, 'n_features_in_', None) - shape = getattr(X, 'shape', None) +def daal4py_kneighbors(estimator, X=None, n_neighbors=None, return_distance=True): + n_features = getattr(estimator, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but kneighbors is expecting {n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but kneighbors is expecting {n_features} features as input" + ) + ) if sklearn_check_version("0.22"): check_is_fitted(estimator) @@ -126,20 +136,17 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, if n_neighbors is None: n_neighbors = estimator.n_neighbors elif n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. 
Got %d" % n_neighbors) else: if not isinstance(n_neighbors, numbers.Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + "enter integer value" % type(n_neighbors) + ) if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = estimator._fit_X @@ -151,8 +158,7 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (n_samples_fit, n_neighbors) + " but n_samples = %d, n_neighbors = %d" % (n_samples_fit, n_neighbors) ) chunked_results = None @@ -162,21 +168,23 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, except ValueError: fptype = None - weights = getattr(estimator, 'weights', 'uniform') + weights = getattr(estimator, "weights", "uniform") params = { - 'method': 'defaultDense', - 'k': n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' if getattr(estimator, '_y', None) is None - else 'computeClassLabels' + "method": "defaultDense", + "k": n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(estimator, "_y", None) is None + else "computeClassLabels", } - if hasattr(estimator, 'classes_'): - params['nClasses'] = len(estimator.classes_) + if hasattr(estimator, "classes_"): + params["nClasses"] = len(estimator.classes_) method = parse_auto_method( - estimator, estimator._fit_method, estimator.n_samples_fit_, n_features) + estimator, estimator._fit_method, estimator.n_samples_fit_, n_features + ) predict_alg = prediction_algorithm(method, fptype, params) prediction_result = predict_alg.compute(X, estimator._daal_model) @@ -184,7 +192,7 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, distances = prediction_result.distances indices = prediction_result.indices - if method == 'kd_tree': + if method == "kd_tree": for i in range(distances.shape[0]): seq = distances[i].argsort() indices[i] = indices[i][seq] @@ -222,18 +230,17 @@ def daal4py_kneighbors(estimator, X=None, n_neighbors=None, # In that case mask the first duplicate. 
dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) if return_distance: - neigh_dist = np.reshape( - neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind return neigh_ind -def validate_data(estimator, X, y=None, reset=True, - validate_separately=False, **check_params): +def validate_data( + estimator, X, y=None, reset=True, validate_separately=False, **check_params +): if y is None: try: requires_y = estimator._get_tags()["requires_y"] @@ -260,37 +267,59 @@ def validate_data(estimator, X, y=None, reset=True, X, y = check_X_y(X, y, **check_params) out = X, y - if sklearn_check_version("0.23") and check_params.get('ensure_2d', True): + if sklearn_check_version("0.23") and check_params.get("ensure_2d", True): estimator._check_n_features(X, reset=reset) return out class NeighborsBase(BaseNeighborsBase): - def __init__(self, n_neighbors=None, radius=None, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( - n_neighbors=n_neighbors, radius=radius, - algorithm=algorithm, leaf_size=leaf_size, metric=metric, - p=p, metric_params=metric_params, n_jobs=n_jobs) + n_neighbors=n_neighbors, + radius=radius, + algorithm=algorithm, + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) def _fit(self, X, y=None): - if self.metric_params is not None and 'p' in self.metric_params: + if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: - warnings.warn("Parameter p is found in metric_params. " - "The corresponding parameter from __init__ " - "is ignored.", SyntaxWarning, stacklevel=2) + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=2, + ) - if hasattr(self, 'weights') and sklearn_check_version("1.0") \ - and not sklearn_check_version("1.2"): + if ( + hasattr(self, "weights") + and sklearn_check_version("1.0") + and not sklearn_check_version("1.2") + ): self.weights = _check_weights(self.weights) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) X_incorrect_type = isinstance( - X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase)) + X, (KDTree, BallTree, NeighborsBase, BaseNeighborsBase) + ) single_output = True self._daal_model = None shape = None @@ -304,8 +333,13 @@ def _fit(self, X, y=None): if y is not None or requires_y: if not X_incorrect_type or y is None: X, y = validate_data( - self, X, y, accept_sparse="csr", multi_output=True, - dtype=[np.float64, np.float32]) + self, + X, + y, + accept_sparse="csr", + multi_output=True, + dtype=[np.float64, np.float32], + ) single_output = False if y.ndim > 1 and y.shape[1] > 1 else True shape = y.shape @@ -322,8 +356,7 @@ def _fit(self, X, y=None): self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique( - y[:, k], return_inverse=True) + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes) if not self.outputs_2d_: @@ -338,7 +371,8 @@ def _fit(self, X, y=None): else: if not X_incorrect_type: X, _ = validate_data( - self, X, accept_sparse='csr', dtype=[np.float64, np.float32]) + self, X, accept_sparse="csr", dtype=[np.float64, np.float32] + ) if not X_incorrect_type: self.n_samples_fit_ = X.shape[0] @@ -349,7 +383,7 @@ def _fit(self, X, y=None): except ValueError: fptype = None - weights = getattr(self, 'weights', 'uniform') + weights = getattr(self, "weights", "uniform") def stock_fit(self, X, y): if sklearn_check_version("0.24"): @@ -360,34 +394,43 @@ def stock_fit(self, X, y): if self.n_neighbors is not None: if self.n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) if not isinstance(self.n_neighbors, numbers.Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + "enter integer value" % type(self.n_neighbors) + ) _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsMixin.kneighbors") - _dal_ready = _patching_status.and_conditions([ - (self.metric == 'minkowski' and self.p == 2 or self.metric == 'euclidean', - f"'{self.metric}' (p={self.p}) metric is not supported. " - "Only 'euclidean' or 'minkowski' with p=2 metrics are supported."), - (not X_incorrect_type, "X is not Tree or Neighbors instance or array."), - (weights in ['uniform', 'distance'], - f"'{weights}' weights is not supported. " - "Only 'uniform' and 'distance' weights are supported."), - (self.algorithm in ['brute', 'kd_tree', 'auto', 'ball_tree'], - f"'{self.algorithm}' algorithm is not supported. " - "Only 'brute', 'kd_tree', 'auto' and 'ball_tree' " - "algorithms are supported."), - (single_output, "Multiple outputs are not supported."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (correct_n_classes, "Number of classes < 2.")]) + "sklearn.neighbors.KNeighborsMixin.kneighbors" + ) + _dal_ready = _patching_status.and_conditions( + [ + ( + self.metric == "minkowski" + and self.p == 2 + or self.metric == "euclidean", + f"'{self.metric}' (p={self.p}) metric is not supported. " + "Only 'euclidean' or 'minkowski' with p=2 metrics are supported.", + ), + (not X_incorrect_type, "X is not Tree or Neighbors instance or array."), + ( + weights in ["uniform", "distance"], + f"'{weights}' weights is not supported. " + "Only 'uniform' and 'distance' weights are supported.", + ), + ( + self.algorithm in ["brute", "kd_tree", "auto", "ball_tree"], + f"'{self.algorithm}' algorithm is not supported. " + "Only 'brute', 'kd_tree', 'auto' and 'ball_tree' " + "algorithms are supported.", + ), + (single_output, "Multiple outputs are not supported."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (correct_n_classes, "Number of classes < 2."), + ] + ) _patching_status.write_log() if _dal_ready: try: @@ -396,7 +439,8 @@ def stock_fit(self, X, y): except RuntimeError: logging.info( "sklearn.neighbors.KNeighborsMixin." - "kneighbors: " + get_patch_message("sklearn_after_daal")) + "kneighbors: " + get_patch_message("sklearn_after_daal") + ) result = stock_fit(self, X, y) else: result = stock_fit(self, X, y) @@ -409,11 +453,9 @@ def stock_fit(self, X, y): class KNeighborsMixin(BaseKNeighborsMixin): def kneighbors(self, X=None, n_neighbors=None, return_distance=True): - daal_model = getattr(self, '_daal_model', None) + daal_model = getattr(self, "_daal_model", None) if X is not None and self.metric != "precomputed": - X = check_array( - X, accept_sparse='csr', dtype=[ - np.float64, np.float32]) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) x = self._fit_X if X is None else X try: fptype = getFPType(x) @@ -421,44 +463,58 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): fptype = None _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsMixin.kneighbors") - _dal_ready = _patching_status.and_conditions([ - (daal_model is not None, "oneDAL model was not trained."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported.")]) + "sklearn.neighbors.KNeighborsMixin.kneighbors" + ) + _dal_ready = _patching_status.and_conditions( + [ + (daal_model is not None, "oneDAL model was not trained."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: result = daal4py_kneighbors(self, X, n_neighbors, return_distance) else: - if daal_model is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + daal_model is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - BaseNeighborsBase._fit(self, self._fit_X, getattr(self, '_y', None)) + BaseNeighborsBase._fit(self, self._fit_X, getattr(self, "_y", None)) else: BaseNeighborsBase._fit(self, self._fit_X) result = super(KNeighborsMixin, self).kneighbors( - X, n_neighbors, return_distance) + X, n_neighbors, return_distance + ) return result class RadiusNeighborsMixin(BaseRadiusNeighborsMixin): - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - daal_model = getattr(self, '_daal_model', None) - - if daal_model is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + daal_model = getattr(self, "_daal_model", None) + + if ( + daal_model is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - BaseNeighborsBase._fit(self, self._fit_X, getattr(self, '_y', None)) + BaseNeighborsBase._fit(self, self._fit_X, getattr(self, "_y", None)) else: BaseNeighborsBase._fit(self, self._fit_X) if sklearn_check_version("0.22"): result = BaseRadiusNeighborsMixin.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = BaseRadiusNeighborsMixin.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result diff --git a/daal4py/sklearn/neighbors/_classification.py b/daal4py/sklearn/neighbors/_classification.py index 73edeae401..75ca98d9a9 100644 --- a/daal4py/sklearn/neighbors/_classification.py +++ b/daal4py/sklearn/neighbors/_classification.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN classification scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin -from ._base import parse_auto_method, prediction_algorithm -from sklearn.base import ClassifierMixin as BaseClassifierMixin -from .._utils import ( - getFPType, - sklearn_check_version, - PatchingConditionsChain) -from .._device_offload import support_usm_ndarray -from sklearn.utils.validation import check_array import numpy as np from scipy import sparse as sp +from sklearn.base import ClassifierMixin as BaseClassifierMixin +from sklearn.utils.validation import check_array + +from .._device_offload import support_usm_ndarray +from .._utils import PatchingConditionsChain, getFPType, sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase, parse_auto_method, prediction_algorithm if sklearn_check_version("0.22"): - from sklearn.neighbors._classification import KNeighborsClassifier as \ - BaseKNeighborsClassifier + from sklearn.neighbors._classification import ( + KNeighborsClassifier as BaseKNeighborsClassifier, + ) + if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights from sklearn.utils.validation import _deprecate_positional_args else: - from sklearn.neighbors.classification import KNeighborsClassifier as \ - BaseKNeighborsClassifier from sklearn.neighbors.base import _check_weights + from sklearn.neighbors.classification import ( + KNeighborsClassifier as BaseKNeighborsClassifier, + ) def _deprecate_positional_args(f): return f def daal4py_classifier_predict(estimator, X, base_predict): - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): estimator._check_feature_names(X, reset=False) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - daal_model = getattr(estimator, '_daal_model', None) - n_features = getattr(estimator, 'n_features_in_', None) - shape = getattr(X, 'shape', None) + X = check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + daal_model = getattr(estimator, "_daal_model", None) + n_features = getattr(estimator, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) try: fptype = getFPType(X) @@ -61,30 +65,37 @@ def daal4py_classifier_predict(estimator, X, base_predict): fptype = None _patching_status = PatchingConditionsChain( - "sklearn.neighbors.KNeighborsClassifier.predict") - _dal_ready = _patching_status.and_conditions([ - (daal_model is not None, "oneDAL model was not trained."), - (fptype is not None, "Unable to get dtype."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported.")]) + "sklearn.neighbors.KNeighborsClassifier.predict" + ) + _dal_ready = _patching_status.and_conditions( + [ + (daal_model is not None, "oneDAL model was not trained."), + (fptype is not None, "Unable to get dtype."), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ] + ) _patching_status.write_log() if _dal_ready: params = { - 'method': 'defaultDense', - 'k': estimator.n_neighbors, - 'nClasses': len(estimator.classes_), - 'voteWeights': 'voteUniform' - if estimator.weights == 'uniform' else 'voteDistance', - 'resultsToEvaluate': 'computeClassLabels', - 'resultsToCompute': '' + "method": "defaultDense", + "k": estimator.n_neighbors, + "nClasses": len(estimator.classes_), + "voteWeights": "voteUniform" + if estimator.weights == "uniform" + else "voteDistance", + "resultsToEvaluate": "computeClassLabels", + "resultsToCompute": "", } method = parse_auto_method( - estimator, estimator.algorithm, estimator.n_samples_fit_, n_features) + estimator, estimator.algorithm, estimator.n_samples_fit_, n_features + ) predict_alg = prediction_algorithm(method, fptype, params) prediction_result = predict_alg.compute(X, daal_model) result = estimator.classes_.take( - np.asarray(prediction_result.prediction.ravel(), dtype=np.intp)) + np.asarray(prediction_result.prediction.ravel(), dtype=np.intp) + ) else: result = base_predict(estimator, X) @@ -92,55 +103,102 @@ def daal4py_classifier_predict(estimator, X, base_predict): if sklearn_check_version("0.24"): + class KNeighborsClassifier_(KNeighborsMixin, BaseClassifierMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors._base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(NeighborsBase, KNeighborsMixin, - BaseSupervisedIntegerMixin, BaseClassifierMixin): + class KNeighborsClassifier_( + NeighborsBase, KNeighborsMixin, BaseSupervisedIntegerMixin, BaseClassifierMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors.base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(NeighborsBase, KNeighborsMixin, - BaseSupervisedIntegerMixin, BaseClassifierMixin): + class KNeighborsClassifier_( + NeighborsBase, KNeighborsMixin, 
BaseSupervisedIntegerMixin, BaseClassifierMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) @@ -148,17 +206,30 @@ class KNeighborsClassifier(KNeighborsClassifier_): __doc__ = BaseKNeighborsClassifier.__doc__ @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) @support_usm_ndarray() def fit(self, X, y): @@ -217,6 +288,6 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) return BaseKNeighborsClassifier.predict_proba(self, X) diff --git a/daal4py/sklearn/neighbors/_regression.py b/daal4py/sklearn/neighbors/_regression.py index a33d5d153a..d7efa48b69 100644 --- a/daal4py/sklearn/neighbors/_regression.py +++ b/daal4py/sklearn/neighbors/_regression.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,80 +12,131 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== # daal4py KNN regression scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin from sklearn.base import RegressorMixin -from .._utils import sklearn_check_version -from .._device_offload import support_usm_ndarray +from .._device_offload import support_usm_ndarray +from .._utils import sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase if sklearn_check_version("0.22"): - from sklearn.neighbors._regression import KNeighborsRegressor as \ - BaseKNeighborsRegressor + from sklearn.neighbors._regression import ( + KNeighborsRegressor as BaseKNeighborsRegressor, + ) + if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights from sklearn.utils.validation import _deprecate_positional_args else: - from sklearn.neighbors.regression import KNeighborsRegressor as \ - BaseKNeighborsRegressor from sklearn.neighbors.base import _check_weights + from sklearn.neighbors.regression import ( + KNeighborsRegressor as BaseKNeighborsRegressor, + ) def _deprecate_positional_args(f): return f if sklearn_check_version("0.24"): + class KNeighborsRegressor_(KNeighborsMixin, RegressorMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) + else: if sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors._base import ( + SupervisedFloatMixin as BaseSupervisedFloatMixin, + ) else: - from sklearn.neighbors.base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors.base import ( + SupervisedFloatMixin as BaseSupervisedFloatMixin, + ) - class KNeighborsRegressor_(NeighborsBase, KNeighborsMixin, - BaseSupervisedFloatMixin, RegressorMixin): + class KNeighborsRegressor_( + NeighborsBase, KNeighborsMixin, BaseSupervisedFloatMixin, RegressorMixin + ): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) class KNeighborsRegressor(KNeighborsRegressor_): __doc__ = BaseKNeighborsRegressor.__doc__ @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, weights='uniform', - algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, 
- **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs, **kwargs) - self.weights = \ + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( weights if sklearn_check_version("1.0") else _check_weights(weights) + ) def _more_tags(self): return BaseKNeighborsRegressor._more_tags(self) @@ -127,6 +178,6 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) return BaseKNeighborsRegressor.predict(self, X) diff --git a/daal4py/sklearn/neighbors/_unsupervised.py b/daal4py/sklearn/neighbors/_unsupervised.py index 71e4839ac7..341dec73ab 100644 --- a/daal4py/sklearn/neighbors/_unsupervised.py +++ b/daal4py/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,77 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py KNN scikit-learn-compatible classes -from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin -from .._utils import sklearn_check_version from .._device_offload import support_usm_ndarray +from .._utils import sklearn_check_version +from ._base import KNeighborsMixin, NeighborsBase, RadiusNeighborsMixin if sklearn_check_version("0.22"): from sklearn.utils.validation import _deprecate_positional_args else: + def _deprecate_positional_args(f): return f if sklearn_check_version("0.22") and not sklearn_check_version("0.23"): + class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) @support_usm_ndarray() def fit(self, X, y=None): return NeighborsBase._fit(self, X) + else: + class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): @_deprecate_positional_args - def __init__(self, *, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): 
super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) @support_usm_ndarray() def fit(self, X, y=None): diff --git a/daal4py/sklearn/neighbors/tests/test_kneighbors.py b/daal4py/sklearn/neighbors/tests/test_kneighbors.py index 3a3e77f6a7..47087d32ef 100644 --- a/daal4py/sklearn/neighbors/tests/test_kneighbors.py +++ b/daal4py/sklearn/neighbors/tests/test_kneighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,46 +12,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import pytest -from sklearn.neighbors \ - import KNeighborsClassifier as ScikitKNeighborsClassifier -from daal4py.sklearn.neighbors \ - import KNeighborsClassifier as DaalKNeighborsClassifier from sklearn.datasets import load_iris -from sklearn.metrics import (accuracy_score, log_loss, roc_auc_score) +from sklearn.metrics import accuracy_score, log_loss, roc_auc_score from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier as ScikitKNeighborsClassifier + from daal4py.sklearn._utils import daal_check_version +from daal4py.sklearn.neighbors import KNeighborsClassifier as DaalKNeighborsClassifier -DISTANCES = ['minkowski'] -ALGORITHMS = ['brute', 'kd_tree', 'auto'] -WEIGHTS = ['uniform', 'distance'] +DISTANCES = ["minkowski"] +ALGORITHMS = ["brute", "kd_tree", "auto"] +WEIGHTS = ["uniform", "distance"] KS = [1, 3, 7, 15, 31] N_TRIES = 10 -ACCURACY_RATIO = 1.0 if daal_check_version(((2020, 'P', 300))) else 0.9 +ACCURACY_RATIO = 1.0 if daal_check_version(((2020, "P", 300))) else 0.9 LOG_LOSS_RATIO = 1.02 ROC_AUC_RATIO = 0.999 IRIS = load_iris() def _test_determenistic(distance, algorithm, weight, k): - x_train, x_test, y_train, y_test = \ - train_test_split(IRIS.data, IRIS.target, - test_size=0.33, random_state=31) + x_train, x_test, y_train, y_test = train_test_split( + IRIS.data, IRIS.target, test_size=0.33, random_state=31 + ) alg_results = [] for _ in range(N_TRIES): # models - scikit_model = ScikitKNeighborsClassifier(n_neighbors=k, - weights=weight, - algorithm=algorithm, - leaf_size=30, p=2, - metric=distance) - daal_model = DaalKNeighborsClassifier(n_neighbors=k, weights=weight, - algorithm=algorithm, - leaf_size=30, p=2, - metric=distance) + scikit_model = ScikitKNeighborsClassifier( + n_neighbors=k, + weights=weight, + algorithm=algorithm, + leaf_size=30, + p=2, + metric=distance, + ) + daal_model = DaalKNeighborsClassifier( + n_neighbors=k, + weights=weight, + algorithm=algorithm, + leaf_size=30, + p=2, + metric=distance, + ) # training scikit_model.fit(x_train, y_train) daal_model.fit(x_train, y_train) @@ -65,8 +71,9 @@ def _test_determenistic(distance, algorithm, weight, k): scikit_accuracy = accuracy_score(y_test, scikit_predict) daal_accuracy = accuracy_score(y_test, daal_predict) ratio = daal_accuracy / scikit_accuracy - 
reason = ("kNN accuracy: scikit_accuracy={},daal_accuracy={}, ratio={}".format( - scikit_accuracy, daal_accuracy, ratio)) + reason = "kNN accuracy: scikit_accuracy={},daal_accuracy={}, ratio={}".format( + scikit_accuracy, daal_accuracy, ratio + ) assert ratio >= ACCURACY_RATIO, reason # predict proba @@ -77,29 +84,30 @@ def _test_determenistic(distance, algorithm, weight, k): daal_log_loss = log_loss(y_test, daal_predict_proba) ratio = daal_log_loss / scikit_log_loss reason = "kNN log_loss: scikit_log_loss={},daal_log_loss={}, ratio={}".format( - scikit_log_loss, daal_log_loss, ratio) + scikit_log_loss, daal_log_loss, ratio + ) assert ratio <= LOG_LOSS_RATIO, reason # ROC AUC - scikit_roc_auc = roc_auc_score( - y_test, scikit_predict_proba, multi_class='ovr') - daal_roc_auc = roc_auc_score( - y_test, daal_predict_proba, multi_class='ovr') + scikit_roc_auc = roc_auc_score(y_test, scikit_predict_proba, multi_class="ovr") + daal_roc_auc = roc_auc_score(y_test, daal_predict_proba, multi_class="ovr") ratio = daal_roc_auc / scikit_roc_auc reason = "kNN roc_auc: scikit_roc_auc={}, daal_roc_auc={}, ratio={}".format( - scikit_roc_auc, daal_roc_auc, ratio) + scikit_roc_auc, daal_roc_auc, ratio + ) assert ratio >= ROC_AUC_RATIO, reason for i in range(1, N_TRIES): for j, res in enumerate(alg_results[i]): - reason = 'Results are different between runs for {}, {}, {}, k={}'.format( - algorithm, weight, distance, k) + reason = "Results are different between runs for {}, {}, {}, k={}".format( + algorithm, weight, distance, k + ) assert (res == alg_results[0][j]).mean() == 1, reason -@pytest.mark.parametrize('distance', DISTANCES) -@pytest.mark.parametrize('algorithm', ALGORITHMS) -@pytest.mark.parametrize('weight', WEIGHTS) -@pytest.mark.parametrize('k', KS) +@pytest.mark.parametrize("distance", DISTANCES) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("weight", WEIGHTS) +@pytest.mark.parametrize("k", KS) def test_determenistic(distance, algorithm, weight, k): _test_determenistic(distance, algorithm, weight, k) diff --git a/daal4py/sklearn/svm/__init__.py b/daal4py/sklearn/svm/__init__.py index c0765729b5..8002e241f9 100644 --- a/daal4py/sklearn/svm/__init__.py +++ b/daal4py/sklearn/svm/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .svm import SVC -__all__ = ['SVC'] +__all__ = ["SVC"] diff --git a/daal4py/sklearn/svm/_svm_0_22.py b/daal4py/sklearn/svm/_svm_0_22.py index 7501e4f900..4b131061ec 100644 --- a/daal4py/sklearn/svm/_svm_0_22.py +++ b/daal4py/sklearn/svm/_svm_0_22.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,30 +12,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from __future__ import print_function -import numpy as np +import warnings +import numpy as np +import sklearn.svm._base as svm_base +import sklearn.svm._classes as svm_classes from scipy import sparse as sp from sklearn.utils import check_random_state, check_X_y -from sklearn.utils.validation import check_is_fitted, _check_sample_weight - -import sklearn.svm._classes as svm_classes -import sklearn.svm._base as svm_base -import warnings +from sklearn.utils.validation import _check_sample_weight, check_is_fitted import daal4py -from .._utils import ( - make2d, - getFPType, - sklearn_check_version, - PatchingConditionsChain) + +from .._utils import PatchingConditionsChain, getFPType, make2d, sklearn_check_version def _get_libsvm_impl(): - return ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr'] + return ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] def _dual_coef_getter(self): @@ -48,17 +44,17 @@ def _intercept_getter(self): def _dual_coef_setter(self, val): self._internal_dual_coef_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False def _intercept_setter(self, val): self._internal_intercept_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False @@ -67,8 +63,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): sv_ind_counters = np.zeros(num_classes, dtype=np.intp) num_of_sv_per_class = np.bincount(labels[np.hstack(sv_ind_by_clf)]) - sv_ind_by_class = [np.empty(n, dtype=np.int32) - for n in num_of_sv_per_class] + sv_ind_by_class = [np.empty(n, dtype=np.int32) for n in num_of_sv_per_class] for indices_per_clf in sv_ind_by_clf: for sv_index in indices_per_clf: @@ -82,6 +77,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): from collections import defaultdict + sv_ind_mapping = defaultdict(lambda: -1) p = 0 for indices_per_class in sv_ind_by_class: @@ -98,13 +94,10 @@ def map_to_lexicographic(n): Returns permutation of reverse lexicographics to lexicographics orders for pairs of n consecutive integer indexes """ - from itertools import (combinations, count) + from itertools import combinations, count + two_class_order_gen = ((j, i) for i in range(n) for j in range(i)) - reverse_lookup = { - key: val for key, - val in zip( - two_class_order_gen, - count(0))} + reverse_lookup = {key: val for key, val in zip(two_class_order_gen, count(0))} perm_iter = (reverse_lookup[pair] for pair in combinations(range(n), 2)) return np.fromiter(perm_iter, dtype=np.intp) @@ -119,8 +112,7 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): Construct dual coefficients array in SKLearn peculiar layout, as well corresponding support vector indexes """ - sv_ind_by_class = group_indices_by_class( - num_classes, sv_ind_by_clf, labels) + sv_ind_by_class = group_indices_by_class(num_classes, sv_ind_by_clf, labels) sv_ind_mapping = map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class) num_unique_sv = len(sv_ind_mapping) @@ -150,14 
+142,15 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): def _daal4py_kf(kernel, X_fptype, gamma=1.0): - if kernel == 'rbf': + if kernel == "rbf": sigma_value = np.sqrt(0.5 / gamma) kf = daal4py.kernel_function_rbf(fptype=X_fptype, sigma=sigma_value) - elif kernel == 'linear': + elif kernel == "linear": kf = daal4py.kernel_function_linear(fptype=X_fptype) else: raise ValueError( - "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel)) + "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel) + ) return kf @@ -167,13 +160,13 @@ def _daal4py_check_weight(self, X, y, sample_weight): if sample_weight.shape[0] > 0: sample_weight = _check_sample_weight(sample_weight, X) if np.all(sample_weight <= 0): - raise ValueError( - 'Invalid input - all samples have zero or negative weights.') + raise ValueError("Invalid input - all samples have zero or negative weights.") if np.any(sample_weight <= 0): if len(np.unique(y[sample_weight > 0])) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights' - ' have the same label.') + "Invalid input - all samples with positive weights" + " have the same label." + ) ww = sample_weight elif self.class_weight is not None: ww = np.ones(X.shape[0], dtype=np.float64) @@ -185,10 +178,19 @@ def _daal4py_check_weight(self, X, y, sample_weight): return ww -def _daal4py_svm(fptype, C, accuracyThreshold, tau, - maxIterations, cacheSize, doShrinking, kernel, nClasses=2): +def _daal4py_svm( + fptype, + C, + accuracyThreshold, + tau, + maxIterations, + cacheSize, + doShrinking, + kernel, + nClasses=2, +): svm_train = daal4py.svm_training( - method='thunder', + method="thunder", fptype=fptype, C=C, accuracyThreshold=accuracyThreshold, @@ -196,7 +198,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, maxIterations=maxIterations, cacheSize=cacheSize, doShrinking=doShrinking, - kernel=kernel + kernel=kernel, ) if nClasses == 2: algo = svm_train @@ -204,7 +206,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, algo = daal4py.multi_class_classifier_training( nClasses=nClasses, fptype=fptype, - method='oneAgainstOne', + method="oneAgainstOne", training=svm_train, ) @@ -212,7 +214,6 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, def _daal4py_fit(self, X, y_inp, sample_weight, kernel): - if self.C <= 0: raise ValueError("C <= 0") num_classes = len(self.classes_) @@ -230,16 +231,17 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): X_fptype = getFPType(X) kf = _daal4py_kf(kernel, X_fptype, gamma=self._gamma) - algo = _daal4py_svm(fptype=X_fptype, - C=float(self.C), - accuracyThreshold=float(self.tol), - tau=1e-12, - maxIterations=int( - self.max_iter if self.max_iter > 0 else 2**30), - cacheSize=int(self.cache_size * 1024 * 1024), - doShrinking=bool(self.shrinking), - kernel=kf, - nClasses=num_classes) + algo = _daal4py_svm( + fptype=X_fptype, + C=float(self.C), + accuracyThreshold=float(self.tol), + tau=1e-12, + maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), + cacheSize=int(self.cache_size * 1024 * 1024), + doShrinking=bool(self.shrinking), + kernel=kf, + nClasses=num_classes, + ) res = algo.compute(data=X, labels=y, weights=ww) model = res.model @@ -252,11 +254,13 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): # support indexes need permutation to arrange them into the same layout # as that of Scikit-Learn - tmp = np.empty(two_class_sv_ind_.shape, dtype=np.dtype( - [('label', y.dtype), ('ind', two_class_sv_ind_.dtype)])) - 
tmp['label'][:] = y[two_class_sv_ind_].ravel() - tmp['ind'][:] = two_class_sv_ind_ - perm = np.argsort(tmp, order=['label', 'ind']) + tmp = np.empty( + two_class_sv_ind_.shape, + dtype=np.dtype([("label", y.dtype), ("ind", two_class_sv_ind_.dtype)]), + ) + tmp["label"][:] = y[two_class_sv_ind_].ravel() + tmp["ind"][:] = two_class_sv_ind_ + perm = np.argsort(tmp, order=["label", "ind"]) del tmp self.support_ = two_class_sv_ind_[perm] @@ -284,10 +288,9 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): two_class_sv_ind_ = svm_model.SupportIndices # Map these indexes to indexes of the training data sv_ind = np.take( - np.hstack( - (label_indexes[i1], - label_indexes[i2])), - two_class_sv_ind_.ravel()) + np.hstack((label_indexes[i1], label_indexes[i2])), + two_class_sv_ind_.ravel(), + ) sv_ind_by_clf.append(sv_ind) # svs_ = getArrayFromNumericTable(svm_model.getSupportVectors()) @@ -304,10 +307,10 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): intercepts = permute_list(intercepts, to_lex_perm) self.dual_coef_, self.support_ = extract_dual_coef( - num_classes, # number of classes + num_classes, # number of classes sv_ind_by_clf, # support vector indexes by two-class classifiers sv_coef_by_clf, # classification coefficients by two-class classifiers - y.squeeze().astype(np.intp, copy=False) # integer labels + y.squeeze().astype(np.intp, copy=False), # integer labels ) self.support_vectors_ = X[self.support_] self.intercept_ = np.array(intercepts) @@ -315,10 +318,12 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): indices = y.take(self.support_, axis=0) if num_classes == 2: self._n_support = np.array( - [np.sum(indices == -1), np.sum(indices == 1)], dtype=np.int32) + [np.sum(indices == -1), np.sum(indices == 1)], dtype=np.int32 + ) else: self._n_support = np.array( - [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32) + [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32 + ) try: self.probA_ = np.empty(0) @@ -329,8 +334,7 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel): self._probB = np.empty(0) -def __compute_gamma__(gamma, kernel, X, sparse, - use_var=True, deprecation=True): +def __compute_gamma__(gamma, kernel, X, sparse, use_var=True, deprecation=True): """ Computes actual value of 'gamma' parameter of RBF kernel corresponding to SVC keyword values `gamma` and `kernel`, and feature @@ -343,23 +347,25 @@ def __compute_gamma__(gamma, kernel, X, sparse, See: https://github.com/scikit-learn/scikit-learn/pull/13221 """ if deprecation: - _gamma_is_scale = gamma in ('scale', 'auto_deprecated') + _gamma_is_scale = gamma in ("scale", "auto_deprecated") else: - _gamma_is_scale = (gamma == 'scale') + _gamma_is_scale = gamma == "scale" if _gamma_is_scale: - kernel_uses_gamma = (not callable(kernel) and kernel - not in ('linear', 'precomputed')) + kernel_uses_gamma = not callable(kernel) and kernel not in ( + "linear", + "precomputed", + ) if kernel_uses_gamma: if sparse: # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() if not use_var: X_sc = np.sqrt(X_sc) else: X_sc = 1.0 / X.shape[1] - if gamma == 'scale': + if gamma == "scale": if X_sc != 0: _gamma = 1.0 / (X.shape[1] * X_sc) else: @@ -370,13 +376,16 @@ def __compute_gamma__(gamma, kernel, X, sparse, # setting `gamma` in examples (also in tests). See # https://github.com/scikit-learn/scikit-learn/pull/10331 # for the examples/tests that need to be reverted. 
- warnings.warn("The default value of gamma will change " - "from 'auto' to 'scale' in version 0.22 to " - "account better for unscaled features. Set " - "gamma explicitly to 'auto' or 'scale' to " - "avoid this warning.", FutureWarning) + warnings.warn( + "The default value of gamma will change " + "from 'auto' to 'scale' in version 0.22 to " + "account better for unscaled features. Set " + "gamma explicitly to 'auto' or 'scale' to " + "avoid this warning.", + FutureWarning, + ) _gamma = 1.0 / X.shape[1] - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] elif isinstance(gamma, str) and not deprecation: raise ValueError( @@ -393,7 +402,8 @@ def _compute_gamma(*args): no_older_than_0_20_3 = sklearn_check_version("0.20.3") no_older_than_0_22 = not sklearn_check_version("0.22") return __compute_gamma__( - *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22) + *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22 + ) def fit(self, X, y, sample_weight=None): @@ -434,54 +444,60 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) - X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = check_X_y( + X, y, dtype=np.float64, order="C", accept_sparse="csr", accept_large_sparse=False + ) y = self._validate_targets(y) - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) solver_type = _get_libsvm_impl().index(self._impl) # input validation if solver_type != 2 and X.shape[0] != y.shape[0]: raise ValueError( "X and y have incompatible shapes.\n" - "X has %s samples, but y has %s." % (X.shape[0], y.shape[0])) + "X has %s samples, but y has %s." % (X.shape[0], y.shape[0]) + ) if self.kernel == "precomputed" and X.shape[0] != X.shape[1]: raise ValueError("X.shape[0] should be equal to X.shape[1]") if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape) + ) self._gamma = _compute_gamma(self.gamma, self.kernel, X, sparse) kernel = self.kernel if callable(kernel): - kernel = 'precomputed' + kernel = "precomputed" fit = self._sparse_fit if self._sparse else self._dense_fit if self.verbose: # pragma: no cover - print('[LibSVM]', end='') + print("[LibSVM]", end="") # see comment on the other call to np.iinfo in this file - seed = rnd.randint(np.iinfo('i').max) - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.fit") - _dal_ready = _patching_status.and_conditions([ - (not sparse, "X is sparse. Sparse input is not supported."), - (not self.probability, "Probabilities are not supported."), - (not getattr(self, 'break_ties', False), "Breaking ties is not supported."), - (kernel in ['linear', 'rbf'], - f"'{kernel}' kernel is not supported. 
" - "Only 'linear' and 'rbf' kernels are supported.") - ]) + seed = rnd.randint(np.iinfo("i").max) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.fit") + _dal_ready = _patching_status.and_conditions( + [ + (not sparse, "X is sparse. Sparse input is not supported."), + (not self.probability, "Probabilities are not supported."), + (not getattr(self, "break_ties", False), "Breaking ties is not supported."), + ( + kernel in ["linear", "rbf"], + f"'{kernel}' kernel is not supported. " + "Only 'linear' and 'rbf' kernels are supported.", + ), + ] + ) _patching_status.write_log() if _dal_ready: self._daal_fit = True @@ -505,8 +521,11 @@ def fit(self, X, y, sample_weight=None): self._internal_dual_coef_ *= -1 self._internal_intercept_ *= -1 - if not self._daal_fit and len(self.classes_) == 2 and self._impl in [ - 'c_svc', 'nu_svc']: + if ( + not self._daal_fit + and len(self.classes_) == 2 + and self._impl in ["c_svc", "nu_svc"] + ): self.intercept_ *= -1 self.dual_coef_ *= -1 @@ -520,9 +539,7 @@ def _daal4py_predict(self, X): kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma) svm_predict = daal4py.svm_prediction( - fptype=X_fptype, - method='defaultDense', - kernel=kf + fptype=X_fptype, method="defaultDense", kernel=kf ) if num_classes == 2: alg = svm_predict @@ -533,8 +550,8 @@ def _daal4py_predict(self, X): maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), accuracyThreshold=float(self.tol), pmethod="voteBased", - tmethod='oneAgainstOne', - prediction=svm_predict + tmethod="oneAgainstOne", + prediction=svm_predict, ) predictionRes = alg.compute(X, self.daal_model_) @@ -566,27 +583,37 @@ def predict(self, X): y_pred : array, shape (n_samples,) """ check_is_fitted(self) - _break_ties = getattr(self, 'break_ties', False) - if _break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.predict") - _dal_ready = _patching_status.and_conditions([ - (not _break_ties, "Breaking ties is not supported."), - (self.decision_function_shape != 'ovr', - "'ovr' decision function shape is not supported."), - (len(self.classes_) <= 2, "Number of classes > 2.") - ], conditions_merging=any) + _break_ties = getattr(self, "break_ties", False) + if _break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict") + _dal_ready = _patching_status.and_conditions( + [ + (not _break_ties, "Breaking ties is not supported."), + ( + self.decision_function_shape != "ovr", + "'ovr' decision function shape is not supported.", + ), + (len(self.classes_) <= 2, "Number of classes > 2."), + ], + conditions_merging=any, + ) _patching_status.write_log() if not _dal_ready: y = np.argmax(self.decision_function(X), axis=1) else: X = self._validate_for_predict(X) - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False) and hasattr(self, 'daal_model_'), - "oneDAL model was not trained.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + getattr(self, "_daal_fit", False) and hasattr(self, "daal_model_"), + "oneDAL model was not trained.", + ) + ] + ) if _dal_ready: y = _daal4py_predict(self, X) else: @@ -612,40 +639,88 @@ def predict(self, X): del __base_svc_init_function_code__ -if 'break_ties' in __base_svc_init_arg_names__: - class SVC(svm_base.BaseSVC): - 
_impl = 'c_svc' - - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): +if "break_ties" in __base_svc_init_arg_names__: - super(SVC, self).__init__( - kernel=kernel, degree=degree, gamma=gamma, - coef0=coef0, tol=tol, C=C, nu=0., shrinking=shrinking, - probability=probability, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) -else: class SVC(svm_base.BaseSVC): - _impl = 'c_svc' + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): + super(SVC, self).__init__( + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - random_state=None): +else: + class SVC(svm_base.BaseSVC): + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="auto_deprecated", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + random_state=None, + ): super(SVC, self).__init__( - kernel=kernel, degree=degree, gamma=gamma, - coef0=coef0, tol=tol, C=C, nu=0., shrinking=shrinking, - probability=probability, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=0.0, + shrinking=shrinking, + probability=probability, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, decision_function_shape=decision_function_shape, - random_state=random_state) + random_state=random_state, + ) + SVC.fit = fit SVC.predict = predict diff --git a/daal4py/sklearn/svm/_svm_0_23.py b/daal4py/sklearn/svm/_svm_0_23.py index 92de19688c..96026668ee 100755 --- a/daal4py/sklearn/svm/_svm_0_23.py +++ b/daal4py/sklearn/svm/_svm_0_23.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,40 +12,42 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from __future__ import print_function -import numpy as np +import warnings +import numpy as np +import sklearn.svm._base as svm_base +import sklearn.svm._classes as svm_classes from scipy import sparse as sp from sklearn.calibration import CalibratedClassifierCV +from sklearn.exceptions import NotFittedError +from sklearn.model_selection import StratifiedKFold from sklearn.utils import check_random_state +from sklearn.utils.multiclass import _ovr_decision_function from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, + _check_sample_weight, _num_samples, - _check_sample_weight) -import sklearn.svm._classes as svm_classes -import sklearn.svm._base as svm_base -import warnings -from sklearn.exceptions import NotFittedError -from sklearn.utils.multiclass import _ovr_decision_function -from sklearn.model_selection import StratifiedKFold + check_consistent_length, + check_is_fitted, +) try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version + from sklearn import __version__ as sklearn_version import daal4py -from .._utils import ( - make2d, getFPType, sklearn_check_version, PatchingConditionsChain) + +from .._utils import PatchingConditionsChain, getFPType, make2d, sklearn_check_version def _get_libsvm_impl(): - return ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr'] + return ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"] def _dual_coef_getter(self): @@ -58,17 +60,17 @@ def _intercept_getter(self): def _dual_coef_setter(self, val): self._internal_dual_coef_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False def _intercept_setter(self, val): self._internal_intercept_ = val - if hasattr(self, 'daal_model_'): + if hasattr(self, "daal_model_"): del self.daal_model_ - if getattr(self, '_daal_fit', False): + if getattr(self, "_daal_fit", False): self._daal_fit = False @@ -77,8 +79,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): sv_ind_counters = np.zeros(num_classes, dtype=np.intp) num_of_sv_per_class = np.bincount(labels[np.hstack(sv_ind_by_clf)]) - sv_ind_by_class = [np.empty(n, dtype=np.int32) - for n in num_of_sv_per_class] + sv_ind_by_class = [np.empty(n, dtype=np.int32) for n in num_of_sv_per_class] for indices_per_clf in sv_ind_by_clf: for sv_index in indices_per_clf: @@ -92,6 +93,7 @@ def group_indices_by_class(num_classes, sv_ind_by_clf, labels): def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): from collections import defaultdict + sv_ind_mapping = defaultdict(lambda: -1) p = 0 for indices_per_class in sv_ind_by_class: @@ -104,13 +106,13 @@ def map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class): def map_to_lexicographic(n): - """ Returns permutation of reverse lexicographics to + """Returns permutation of reverse lexicographics to lexicographics orders for pairs of n consecutive integer indexes """ - from itertools import (combinations, count) + from itertools import combinations, count + two_class_order_gen = ((j, i) for i in range(n) for j in range(i)) - reverse_lookup = {key: val for key, - val in zip(two_class_order_gen, count(0))} + reverse_lookup = {key: val for key, val in zip(two_class_order_gen, count(0))} perm_iter = 
(reverse_lookup[pair] for pair in combinations(range(n), 2)) return np.fromiter(perm_iter, dtype=np.intp) @@ -121,11 +123,10 @@ def permute_list(li, perm): def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): - """ Construct dual coefficients array in SKLearn peculiar layout, + """Construct dual coefficients array in SKLearn peculiar layout, as well corresponding support vector indexes """ - sv_ind_by_class = group_indices_by_class( - num_classes, sv_ind_by_clf, labels) + sv_ind_by_class = group_indices_by_class(num_classes, sv_ind_by_clf, labels) sv_ind_mapping = map_sv_to_columns_in_dual_coef_matrix(sv_ind_by_class) num_unique_sv = len(sv_ind_mapping) @@ -156,15 +157,17 @@ def extract_dual_coef(num_classes, sv_ind_by_clf, sv_coef_by_clf, labels): def _daal4py_kf(kernel, X_fptype, gamma=1.0, is_sparse=False): method = "fastCSR" if is_sparse else "defaultDense" - if kernel == 'rbf': + if kernel == "rbf": sigma_value = np.sqrt(0.5 / gamma) kf = daal4py.kernel_function_rbf( - fptype=X_fptype, method=method, sigma=sigma_value) - elif kernel == 'linear': + fptype=X_fptype, method=method, sigma=sigma_value + ) + elif kernel == "linear": kf = daal4py.kernel_function_linear(fptype=X_fptype, method=method) else: raise ValueError( - "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel)) + "_daal4py_fit received unexpected kernel specifiction {}.".format(kernel) + ) return kf @@ -174,13 +177,13 @@ def _daal4py_check_weight(self, X, y, sample_weight): if sample_weight.shape[0] > 0: sample_weight = _check_sample_weight(sample_weight, X) if np.all(sample_weight <= 0): - raise ValueError( - 'Invalid input - all samples have zero or negative weights.') + raise ValueError("Invalid input - all samples have zero or negative weights.") if np.any(sample_weight <= 0): if len(np.unique(y[sample_weight > 0])) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights ' - 'have the same label.') + "Invalid input - all samples with positive weights " + "have the same label." 
+ ) ww = sample_weight elif self.class_weight is not None: ww = np.ones(X.shape[0], dtype=np.float64) @@ -190,10 +193,19 @@ def _daal4py_check_weight(self, X, y, sample_weight): return ww -def _daal4py_svm(fptype, C, accuracyThreshold, tau, - maxIterations, cacheSize, doShrinking, kernel, nClasses=2): +def _daal4py_svm( + fptype, + C, + accuracyThreshold, + tau, + maxIterations, + cacheSize, + doShrinking, + kernel, + nClasses=2, +): svm_train = daal4py.svm_training( - method='thunder', + method="thunder", fptype=fptype, C=C, accuracyThreshold=accuracyThreshold, @@ -201,7 +213,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, maxIterations=maxIterations, cacheSize=cacheSize, doShrinking=doShrinking, - kernel=kernel + kernel=kernel, ) if nClasses == 2: algo = svm_train @@ -209,7 +221,7 @@ def _daal4py_svm(fptype, C, accuracyThreshold, tau, algo = daal4py.multi_class_classifier_training( nClasses=nClasses, fptype=fptype, - method='oneAgainstOne', + method="oneAgainstOne", training=svm_train, ) @@ -227,17 +239,17 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): y = make2d(y_inp) X_fptype = getFPType(X) kf = _daal4py_kf(kernel, X_fptype, gamma=self._gamma, is_sparse=is_sparse) - algo = _daal4py_svm(fptype=X_fptype, - C=float(self.C), - accuracyThreshold=float(self.tol), - tau=1e-12, - maxIterations=int( - self.max_iter if self.max_iter > 0 else 2**30), - cacheSize=int( - self.cache_size * 1024 * 1024), - doShrinking=bool(self.shrinking), - kernel=kf, - nClasses=num_classes) + algo = _daal4py_svm( + fptype=X_fptype, + C=float(self.C), + accuracyThreshold=float(self.tol), + tau=1e-12, + maxIterations=int(self.max_iter if self.max_iter > 0 else 2**30), + cacheSize=int(self.cache_size * 1024 * 1024), + doShrinking=bool(self.shrinking), + kernel=kf, + nClasses=num_classes, + ) res = algo.compute(data=X, labels=y, weights=sample_weight) @@ -251,11 +263,13 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): # support indexes need permutation to arrange them # into the same layout as that of Scikit-Learn - tmp = np.empty(two_class_sv_ind_.shape, dtype=np.dtype( - [('label', y.dtype), ('ind', two_class_sv_ind_.dtype)])) - tmp['label'][:] = y[two_class_sv_ind_].ravel() - tmp['ind'][:] = two_class_sv_ind_ - perm = np.argsort(tmp, order=['label', 'ind']) + tmp = np.empty( + two_class_sv_ind_.shape, + dtype=np.dtype([("label", y.dtype), ("ind", two_class_sv_ind_.dtype)]), + ) + tmp["label"][:] = y[two_class_sv_ind_].ravel() + tmp["ind"][:] = two_class_sv_ind_ + perm = np.argsort(tmp, order=["label", "ind"]) del tmp self.support_ = two_class_sv_ind_[perm] @@ -285,10 +299,9 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): two_class_sv_ind_ = svm_model.SupportIndices # Map these indexes to indexes of the training data sv_ind = np.take( - np.hstack( - (label_indexes[i1], - label_indexes[i2])), - two_class_sv_ind_.ravel()) + np.hstack((label_indexes[i1], label_indexes[i2])), + two_class_sv_ind_.ravel(), + ) sv_ind_by_clf.append(sv_ind) # svs_ = getArrayFromNumericTable(svm_model.getSupportVectors()) @@ -305,10 +318,10 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): intercepts = permute_list(intercepts, to_lex_perm) self.dual_coef_, self.support_ = extract_dual_coef( - num_classes, # number of classes + num_classes, # number of classes sv_ind_by_clf, # support vector indexes by two-class classifiers sv_coef_by_clf, # classification coefficients by two-class classifiers - 
y.squeeze().astype(np.intp, copy=False) # integer labels + y.squeeze().astype(np.intp, copy=False), # integer labels ) if is_sparse: self.dual_coef_ = sp.csr_matrix(self.dual_coef_) @@ -317,7 +330,8 @@ def _daal4py_fit(self, X, y_inp, sample_weight, kernel, is_sparse=False): indices = y.take(self.support_, axis=0) self._n_support = np.array( - [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32) + [np.sum(indices == i) for i, c in enumerate(self.classes_)], dtype=np.int32 + ) self._probA = np.empty(0) self._probB = np.empty(0) @@ -336,23 +350,25 @@ def __compute_gamma__(gamma, kernel, X, use_var=True, deprecation=True): See: https://github.com/scikit-learn/scikit-learn/pull/13221 """ if deprecation: - _gamma_is_scale = gamma in ('scale', 'auto_deprecated') + _gamma_is_scale = gamma in ("scale", "auto_deprecated") else: - _gamma_is_scale = (gamma == 'scale') + _gamma_is_scale = gamma == "scale" if _gamma_is_scale: - kernel_uses_gamma = (not callable(kernel) and kernel - not in ('linear', 'precomputed')) + kernel_uses_gamma = not callable(kernel) and kernel not in ( + "linear", + "precomputed", + ) if kernel_uses_gamma: if sp.isspmatrix(X): # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() if not use_var: X_sc = np.sqrt(X_sc) else: X_sc = 1.0 / X.shape[1] - if gamma == 'scale': + if gamma == "scale": if X_sc != 0: _gamma = 1.0 / (X.shape[1] * X_sc) else: @@ -363,13 +379,16 @@ def __compute_gamma__(gamma, kernel, X, use_var=True, deprecation=True): # setting `gamma` in examples (also in tests). See # https://github.com/scikit-learn/scikit-learn/pull/10331 # for the examples/tests that need to be reverted. - warnings.warn("The default value of gamma will change " - "from 'auto' to 'scale' in version 0.22 to " - "account better for unscaled features. Set " - "gamma explicitly to 'auto' or 'scale' to " - "avoid this warning.", FutureWarning) + warnings.warn( + "The default value of gamma will change " + "from 'auto' to 'scale' in version 0.22 to " + "account better for unscaled features. Set " + "gamma explicitly to 'auto' or 'scale' to " + "avoid this warning.", + FutureWarning, + ) _gamma = 1.0 / X.shape[1] - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] elif isinstance(gamma, str) and not deprecation: raise ValueError( @@ -386,9 +405,8 @@ def _compute_gamma(*args): no_older_than_0_20_3 = sklearn_check_version("0.20.3") no_older_than_0_22 = not sklearn_check_version("0.22") return __compute_gamma__( - *args, - use_var=no_older_than_0_20_3, - deprecation=no_older_than_0_22) + *args, use_var=no_older_than_0_20_3, deprecation=no_older_than_0_22 + ) def fit(self, X, y, sample_weight=None): @@ -429,8 +447,8 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = is_sparse and not callable(self.kernel) - if hasattr(self, 'decision_function_shape'): - if self.decision_function_shape not in ('ovr', 'ovo'): + if hasattr(self, "decision_function_shape"): + if self.decision_function_shape not in ("ovr", "ovo"): raise ValueError( f"decision_function_shape must be either 'ovr' or 'ovo', " f"got {self.decision_function_shape}." 
@@ -439,14 +457,19 @@ def fit(self, X, y, sample_weight=None): if callable(self.kernel): check_consistent_length(X, y) else: - X, y = self._validate_data(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data( + X, + y, + dtype=np.float64, + order="C", + accept_sparse="csr", + accept_large_sparse=False, + ) y = self._validate_targets(y) - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) solver_type = _get_libsvm_impl().index(self._impl) # input validation @@ -454,37 +477,43 @@ def fit(self, X, y, sample_weight=None): if solver_type != 2 and n_samples != y.shape[0]: raise ValueError( "X and y have incompatible shapes.\n" - "X has %s samples, but y has %s." % (n_samples, y.shape[0])) + "X has %s samples, but y has %s." % (n_samples, y.shape[0]) + ) if self.kernel == "precomputed" and n_samples != X.shape[1]: raise ValueError("X.shape[0] should be equal to X.shape[1]") if sample_weight.shape[0] > 0 and sample_weight.shape[0] != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) - - kernel = 'precomputed' if callable(self.kernel) else self.kernel - if kernel == 'precomputed': + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape) + ) + + kernel = "precomputed" if callable(self.kernel) else self.kernel + if kernel == "precomputed": self._gamma = 0.0 else: self._gamma = _compute_gamma(self.gamma, kernel, X) fit = self._sparse_fit if self._sparse else self._dense_fit if self.verbose: # pragma: no cover - print('[LibSVM]', end='') + print("[LibSVM]", end="") # see comment on the other call to np.iinfo in this file - seed = rnd.randint(np.iinfo('i').max) - - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.fit") - _dal_ready = _patching_status.and_conditions([ - (kernel in ['linear', 'rbf'], - f"'{kernel}' kernel is not supported. " - "Only 'linear' and 'rbf' kernels are supported.")]) + seed = rnd.randint(np.iinfo("i").max) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.fit") + _dal_ready = _patching_status.and_conditions( + [ + ( + kernel in ["linear", "rbf"], + f"'{kernel}' kernel is not supported. 
" + "Only 'linear' and 'rbf' kernels are supported.", + ) + ] + ) _patching_status.write_log() if _dal_ready: sample_weight = _daal4py_check_weight(self, X, y, sample_weight) @@ -496,36 +525,37 @@ def fit(self, X, y, sample_weight=None): if self.probability: params = self.get_params() params["probability"] = False - params["decision_function_shape"] = 'ovr' + params["decision_function_shape"] = "ovr" clf_base = SVC(**params) try: n_splits = 5 cv = StratifiedKFold( - n_splits=n_splits, - shuffle=True, - random_state=self.random_state) + n_splits=n_splits, shuffle=True, random_state=self.random_state + ) if Version(sklearn_version) >= Version("0.24"): self.clf_prob = CalibratedClassifierCV( - clf_base, ensemble=False, cv=cv, method='sigmoid', - n_jobs=n_splits) + clf_base, ensemble=False, cv=cv, method="sigmoid", n_jobs=n_splits + ) else: self.clf_prob = CalibratedClassifierCV( - clf_base, cv=cv, method='sigmoid') + clf_base, cv=cv, method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) except ValueError: clf_base = clf_base.fit(X, y, sample_weight) self.clf_prob = CalibratedClassifierCV( - clf_base, cv="prefit", method='sigmoid') + clf_base, cv="prefit", method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) else: self._daal_fit = False fit(X, y, sample_weight, solver_type, kernel, random_seed=seed) - self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples, ) + self.shape_fit_ = X.shape if hasattr(X, "shape") else (n_samples,) # In binary case, we need to flip the sign of coef, intercept and # decision function. Use self._intercept_ and self._dual_coef_ internally. - if not getattr(self, '_daal_fit', False): + if not getattr(self, "_daal_fit", False): self._internal_intercept_ = self.intercept_.copy() self._internal_dual_coef_ = self.dual_coef_.copy() else: @@ -535,13 +565,11 @@ def fit(self, X, y, sample_weight=None): self._internal_dual_coef_ *= -1 self._internal_intercept_ *= -1 - if not getattr( - self, - '_daal_fit', - False) and len( - self.classes_) == 2 and self._impl in [ - 'c_svc', - 'nu_svc']: + if ( + not getattr(self, "_daal_fit", False) + and len(self.classes_) == 2 + and self._impl in ["c_svc", "nu_svc"] + ): self.intercept_ *= -1 self.dual_coef_ *= -1 @@ -552,26 +580,24 @@ def _daal4py_predict(self, X, is_decision_function=False): X_fptype = getFPType(X) num_classes = len(self.classes_) - kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma, - is_sparse=sp.isspmatrix(X)) + kf = _daal4py_kf(self.kernel, X_fptype, gamma=self._gamma, is_sparse=sp.isspmatrix(X)) svm_predict = daal4py.svm_prediction( - fptype=X_fptype, - method='defaultDense', - kernel=kf + fptype=X_fptype, method="defaultDense", kernel=kf ) if num_classes == 2: alg = svm_predict else: - result_to_compute = 'computeDecisionFunction' \ - if is_decision_function else 'computeClassLabels' + result_to_compute = ( + "computeDecisionFunction" if is_decision_function else "computeClassLabels" + ) alg = daal4py.multi_class_classifier_prediction( nClasses=num_classes, fptype=X_fptype, pmethod="voteBased", - tmethod='oneAgainstOne', + tmethod="oneAgainstOne", resultsToEvaluate=result_to_compute, - prediction=svm_predict + prediction=svm_predict, ) predictionRes = alg.compute(X, self.daal_model_) @@ -606,27 +632,37 @@ def predict(self, X): """ check_is_fitted(self) - _break_ties = getattr(self, 'break_ties', False) - if _break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") - - _patching_status = 
PatchingConditionsChain( - "sklearn.svm.SVC.predict") - _dal_ready = _patching_status.and_conditions([ - (not _break_ties, "Breaking ties is not supported."), - (self.decision_function_shape != 'ovr', - "'ovr' decision function shape is not supported."), - (len(self.classes_) <= 2, "Number of classes > 2.") - ], conditions_merging=any) + _break_ties = getattr(self, "break_ties", False) + if _break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) + + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict") + _dal_ready = _patching_status.and_conditions( + [ + (not _break_ties, "Breaking ties is not supported."), + ( + self.decision_function_shape != "ovr", + "'ovr' decision function shape is not supported.", + ), + (len(self.classes_) <= 2, "Number of classes > 2."), + ], + conditions_merging=any, + ) _patching_status.write_log() if not _dal_ready: y = np.argmax(self.decision_function(X), axis=1) else: X = self._validate_for_predict(X) - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False) and hasattr(self, 'daal_model_'), - "oneDAL model was not trained.")]) + _dal_ready = _patching_status.and_conditions( + [ + ( + getattr(self, "_daal_fit", False) and hasattr(self, "daal_model_"), + "oneDAL model was not trained.", + ) + ] + ) if _dal_ready: if self.probability and self.clf_prob is not None: y = self.clf_prob.predict(X) @@ -642,9 +678,10 @@ def predict(self, X): def _daal4py_predict_proba(self, X): X = self._validate_for_predict(X) - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") + "predict_proba is not available when fitted with probability=False" + ) prob = self.clf_prob.predict_proba(X) return prob @@ -678,10 +715,10 @@ def predict_proba(self): """ self._check_proba() - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.predict_proba") - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False), "oneDAL model was not trained.")]) + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.predict_proba") + _dal_ready = _patching_status.and_conditions( + [(getattr(self, "_daal_fit", False), "oneDAL model was not trained.")] + ) _patching_status.write_log() if _dal_ready: algo = self._daal4py_predict_proba @@ -717,17 +754,17 @@ def decision_function(self, X): transformation of ovo decision function. 
""" - _patching_status = PatchingConditionsChain( - "sklearn.svm.SVC.decision_function") - _dal_ready = _patching_status.and_conditions([ - (getattr(self, '_daal_fit', False), "oneDAL model was not trained.")]) + _patching_status = PatchingConditionsChain("sklearn.svm.SVC.decision_function") + _dal_ready = _patching_status.and_conditions( + [(getattr(self, "_daal_fit", False), "oneDAL model was not trained.")] + ) _patching_status.write_log() if _dal_ready: X = self._validate_for_predict(X) dec = _daal4py_predict(self, X, is_decision_function=True) else: dec = self._decision_function(X) - if self.decision_function_shape == 'ovr' and len(self.classes_) > 2: + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: return _ovr_decision_function(dec < 0, -dec, len(self.classes_)) return dec @@ -749,14 +786,26 @@ def decision_function(self, X): class SVC(svm_base.BaseSVC): - _impl = 'c_svc' - - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): - + _impl = "c_svc" + + def __init__( + self, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super(SVC, self).__init__( kernel=kernel, degree=degree, @@ -764,7 +813,7 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=coef0, tol=tol, C=C, - nu=0., + nu=0.0, shrinking=shrinking, probability=probability, cache_size=cache_size, @@ -773,7 +822,8 @@ def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', max_iter=max_iter, decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + random_state=random_state, + ) SVC.fit = fit diff --git a/daal4py/sklearn/svm/svm.py b/daal4py/sklearn/svm/svm.py index 14ca6f258e..fb15c62fdf 100644 --- a/daal4py/sklearn/svm/svm.py +++ b/daal4py/sklearn/svm/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,11 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import sklearn_check_version -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from ._svm_0_23 import * -elif sklearn_check_version('0.22'): +elif sklearn_check_version("0.22"): from ._svm_0_22 import * diff --git a/daal4py/sklearn/test/test_common.py b/daal4py/sklearn/test/test_common.py index eb22ecda0b..b2b59c9651 100644 --- a/daal4py/sklearn/test/test_common.py +++ b/daal4py/sklearn/test/test_common.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,46 +12,45 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== +import numpy as np import pandas as pd import pytest -import numpy as np -from daal4py.sklearn.ensemble \ - import RandomForestClassifier as DaalRandomForestClassifier -from daal4py.sklearn.ensemble \ - import RandomForestRegressor as DaalRandomForestRegressor -from daal4py.sklearn.neighbors import KNeighborsClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split +from daal4py.sklearn.ensemble import RandomForestClassifier as DaalRandomForestClassifier +from daal4py.sklearn.ensemble import RandomForestRegressor as DaalRandomForestRegressor +from daal4py.sklearn.neighbors import KNeighborsClassifier + -def convert_data(data, class_name=np.array, order='C', dtype=np.float64): - if order == 'C': +def convert_data(data, class_name=np.array, order="C", dtype=np.float64): + if order == "C": data = np.ascontiguousarray(data, dtype=dtype) else: data = np.asfortranarray(data, dtype=dtype) return class_name(data) -def make_dataset(n_samples=256, n_features=5, n_classes=2, - test_size=0.5, shuffle=True): - x, y = make_classification(n_samples=n_samples, n_features=n_features, - n_classes=n_classes, random_state=777) - return train_test_split(x, y, random_state=777, - test_size=test_size, shuffle=shuffle) +def make_dataset(n_samples=256, n_features=5, n_classes=2, test_size=0.5, shuffle=True): + x, y = make_classification( + n_samples=n_samples, n_features=n_features, n_classes=n_classes, random_state=777 + ) + return train_test_split(x, y, random_state=777, test_size=test_size, shuffle=shuffle) ESTIMATORS = { - 'KNeighborsClassifier': - KNeighborsClassifier(n_neighbors=10), - 'DaalRandomForestClassifier': - DaalRandomForestClassifier(n_estimators=10, random_state=777), - 'DaalRandomForestRegressor': - DaalRandomForestRegressor(n_estimators=10, random_state=777), + "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=10), + "DaalRandomForestClassifier": DaalRandomForestClassifier( + n_estimators=10, random_state=777 + ), + "DaalRandomForestRegressor": DaalRandomForestRegressor( + n_estimators=10, random_state=777 + ), } -ORDERS = ['C', 'F'] +ORDERS = ["C", "F"] DATA_FORMATS = [pd.DataFrame, np.array] @@ -70,10 +69,11 @@ def check_data_formats_diff(name): for i in range(1, len(alg_results)): for 
j, res in enumerate(alg_results[i]): - assert (res == alg_results[0][j]).mean() == 1, \ - ('Results are different between formats: estimator=%s' % (name)) + assert ( + res == alg_results[0][j] + ).mean() == 1, "Results are different between formats: estimator=%s" % (name) -@pytest.mark.parametrize('name', ESTIMATORS) +@pytest.mark.parametrize("name", ESTIMATORS) def test_data_formats_diff(name): check_data_formats_diff(name) diff --git a/daal4py/sklearn/tree/__init__.py b/daal4py/sklearn/tree/__init__.py index 95a262ed20..7ffdab77a7 100644 --- a/daal4py/sklearn/tree/__init__.py +++ b/daal4py/sklearn/tree/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .decision_tree import DecisionTreeClassifier -__all__ = ['DecisionTreeClassifier'] +__all__ = ["DecisionTreeClassifier"] diff --git a/daal4py/sklearn/tree/decision_tree.py b/daal4py/sklearn/tree/decision_tree.py index b20bc75b90..8ec6f73439 100644 --- a/daal4py/sklearn/tree/decision_tree.py +++ b/daal4py/sklearn/tree/decision_tree.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== # daal4py DecisionTree scikit-learn-compatible estimator classes -import numpy as np import numbers import warnings + +import numpy as np +from scipy.sparse import issparse from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.exceptions import DataConversionWarning -from sklearn.utils.validation import ( - check_array, check_is_fitted, check_consistent_length -) from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_array, check_consistent_length, check_is_fitted + import daal4py as d4p -from .._utils import (make2d, getFPType) + from .._device_offload import support_usm_ndarray -from scipy.sparse import issparse +from .._utils import getFPType, make2d class DecisionTreeClassifier(BaseEstimator, ClassifierMixin): @@ -71,8 +72,10 @@ class DecisionTreeClassifier(BaseEstimator, ClassifierMixin): nBins is the number of bins used to compute probabilities of the observations belonging to the class. 
""" - def __init__(self, max_depth=None, min_observations_in_leaf_node=1, - split_criterion='gini'): + + def __init__( + self, max_depth=None, min_observations_in_leaf_node=1, split_criterion="gini" + ): self.max_depth = max_depth self.min_observations_in_leaf_node = min_observations_in_leaf_node self.split_criterion = split_criterion @@ -94,9 +97,11 @@ def _daal4py_fit(self, X, y, w, pruning_set=None): _pruning_X = make2d(_pruning_X) _pruning_y = make2d(_pruning_y) else: - raise ValueError("pruning_set parameter is expected to be " - "a tuple of pruning features and pruning " - "dependent variables") + raise ValueError( + "pruning_set parameter is expected to be " + "a tuple of pruning features and pruning " + "dependent variables" + ) if w is not None: w = make2d(np.asarray(w)) @@ -109,11 +114,11 @@ def _daal4py_fit(self, X, y, w, pruning_set=None): splitCriterion=self.split_criterion, maxTreeDepth=daal_max_tree_depth, minObservationsInLeafNodes=int(self.min_observations_in_leaf_node), - pruning=_pruning) - res = alg.compute(X, y, - dataForPruning=_pruning_X, - labelsForPruning=_pruning_y, - weights=w) + pruning=_pruning, + ) + res = alg.compute( + X, y, dataForPruning=_pruning_X, labelsForPruning=_pruning_y, weights=w + ) self.daal_model_ = res.model self._cached_tree_state_ = None @@ -122,7 +127,7 @@ def _get_tree_state(self): Internal utility that returns an array behind scikit-learn's tree object from daal_model_ produced by call to fit """ - check_is_fitted(self, ['daal_model_', '_cached_tree_state_']) + check_is_fitted(self, ["daal_model_", "_cached_tree_state_"]) if self._cached_tree_state_ is None: tree_state_class = d4p.getTreeState(self.daal_model_, int(self.n_classes_)) self._cached_tree_state_ = tree_state_class @@ -170,20 +175,26 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): onedal-documentation.html """ - if self.split_criterion not in ('gini', 'infoGain'): - raise ValueError('Parameter "split_criterion" must be ' - '"gini" or "infoGain".') + if self.split_criterion not in ("gini", "infoGain"): + raise ValueError( + 'Parameter "split_criterion" must be ' '"gini" or "infoGain".' + ) - if not isinstance(self.max_depth, numbers.Integral) or \ - self.max_depth < 0: + if not isinstance(self.max_depth, numbers.Integral) or self.max_depth < 0: if self.max_depth is not None: - raise ValueError('Parameter "max_depth" must be ' - 'a non-negative integer value or None.') - - if not isinstance(self.min_observations_in_leaf_node, numbers.Integral) or \ - self.min_observations_in_leaf_node <= 0: - raise ValueError('Parameter "min_observations_in_leaf_node" must be ' - 'non-zero positive integer value.') + raise ValueError( + 'Parameter "max_depth" must be ' + "a non-negative integer value or None." + ) + + if ( + not isinstance(self.min_observations_in_leaf_node, numbers.Integral) + or self.min_observations_in_leaf_node <= 0 + ): + raise ValueError( + 'Parameter "min_observations_in_leaf_node" must be ' + "non-zero positive integer value." + ) X = check_array(X, dtype=[np.single, np.double]) y = np.asarray(y) @@ -194,7 +205,8 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): "A column-vector y was passed when a 1d array was" " expected. 
Please change the shape of y to " "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2 + DataConversionWarning, + stacklevel=2, ) check_consistent_length(X, y) @@ -207,9 +219,11 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): self.n_outputs_ = y.shape[1] if self.n_outputs_ != 1: _class_name = self.__class__.__name__ - raise ValueError(_class_name + " does not currently support " - "multi-output data. " - "Consider using OneHotEncoder") + raise ValueError( + _class_name + " does not currently support " + "multi-output data. " + "Consider using OneHotEncoder" + ) y = check_array(y, ensure_2d=False, dtype=None) check_classification_targets(y) @@ -221,8 +235,9 @@ def fit(self, X, y, sample_weight=None, pruning_set=None): y_store_unique_indices = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = \ - np.unique(y[:, k], return_inverse=True) + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices @@ -243,17 +258,16 @@ def _validate_X_predict(self, X, check_input): """Validate X whenever one tries to predict, apply, predict_proba""" if check_input: X = check_array(X, dtype=[np.single, np.double], accept_sparse="csr") - if issparse(X) and \ - (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): - raise ValueError("No support for np.int64 index based " - "sparse matrices") + if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): + raise ValueError("No support for np.int64 index based " "sparse matrices") n_features = X.shape[1] if self.n_features_ != n_features: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is %s and " - "input n_features is %s " - % (self.n_features_, n_features)) + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is %s and " + "input n_features is %s " % (self.n_features_, n_features) + ) return X @@ -264,21 +278,21 @@ def _daal4py_predict(self, X): method="defaultDense", nBins=1, nClasses=self.n_classes_, - resultsToEvaluate="computeClassLabels" + resultsToEvaluate="computeClassLabels", ) res = alg.compute(X, self.daal_model_) return res.prediction.ravel() @support_usm_ndarray() def predict(self, X, check_input=True): - check_is_fitted(self, 'daal_model_') + check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) y = self._daal4py_predict(X) return self.classes_.take(np.asarray(y, dtype=np.intp), axis=0) @support_usm_ndarray() def predict_proba(self, X, check_input=True): - check_is_fitted(self, 'daal_model_') + check_is_fitted(self, "daal_model_") X = self._validate_X_predict(X, check_input) y = self._daal4py_predict(X) return self.classes_.take(np.asarray(y, dtype=np.intp), axis=0) diff --git a/daal4py/sklearn/utils/__init__.py b/daal4py/sklearn/utils/__init__.py index 810c35da04..6b7cf7b664 100644 --- a/daal4py/sklearn/utils/__init__.py +++ b/daal4py/sklearn/utils/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _assert_all_finite -__all__ = ['_assert_all_finite', '_daal_check_array', '_daal_check_X_y', - '_daal_validate_data'] +__all__ = [ + "_assert_all_finite", + "_daal_check_array", + "_daal_check_X_y", + "_daal_validate_data", +] diff --git a/daal4py/sklearn/utils/base.py b/daal4py/sklearn/utils/base.py index e7718d8f87..586f137ee0 100644 --- a/daal4py/sklearn/utils/base.py +++ b/daal4py/sklearn/utils/base.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _daal_check_array, _daal_check_X_y -def _daal_validate_data(self, X, y=None, reset=True, - validate_separately=False, **check_params): +def _daal_validate_data( + self, X, y=None, reset=True, validate_separately=False, **check_params +): """Validate input data and set or check the `n_features_in_` attribute. Parameters @@ -49,7 +50,7 @@ def _daal_validate_data(self, X, y=None, reset=True, """ if y is None: - if self._get_tags()['requires_y']: + if self._get_tags()["requires_y"]: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." @@ -69,6 +70,6 @@ def _daal_validate_data(self, X, y=None, reset=True, X, y = _daal_check_X_y(X, y, **check_params) out = X, y - if check_params.get('ensure_2d', True): + if check_params.get("ensure_2d", True): self._check_n_features(X, reset=reset) return out diff --git a/daal4py/sklearn/utils/validation.py b/daal4py/sklearn/utils/validation.py index b9e6aa959a..a57aea6f18 100644 --- a/daal4py/sklearn/utils/validation.py +++ b/daal4py/sklearn/utils/validation.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2014 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,38 +12,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import numpy as np -import daal4py as d4p -from sklearn import get_config as _get_config -from sklearn.utils.fixes import _object_dtype_isnan import warnings from contextlib import suppress + +import numpy as np import scipy.sparse as sp from numpy.core.numeric import ComplexWarning -from sklearn.utils.validation import (_num_samples, _ensure_no_complex_data, - _ensure_sparse_format, column_or_1d, - check_consistent_length) -from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn import get_config as _get_config from sklearn.utils.extmath import _safe_accumulator_op -from .._utils import (is_DataFrame, get_dtype, get_number_of_types, - sklearn_check_version, PatchingConditionsChain) +from sklearn.utils.fixes import _object_dtype_isnan +from sklearn.utils.validation import _assert_all_finite as _sklearn_assert_all_finite +from sklearn.utils.validation import ( + _ensure_no_complex_data, + _ensure_sparse_format, + _num_samples, + check_consistent_length, + column_or_1d, +) +import daal4py as d4p -def _assert_all_finite(X, allow_nan=False, msg_dtype=None, - estimator_name=None, input_name=""): - if _get_config()['assume_finite']: +from .._utils import ( + PatchingConditionsChain, + get_dtype, + get_number_of_types, + is_DataFrame, + sklearn_check_version, +) + + +def _assert_all_finite( + X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name="" +): + if _get_config()["assume_finite"]: return # Data with small size has too big relative overhead # TODO: tune threshold size - if hasattr(X, 'size'): + if hasattr(X, "size"): if X.size < 32768: if sklearn_check_version("1.1"): - _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype, - estimator_name=estimator_name, - input_name=input_name) + _sklearn_assert_all_finite( + X, + allow_nan=allow_nan, + msg_dtype=msg_dtype, + estimator_name=estimator_name, + input_name=input_name, + ) else: _sklearn_assert_all_finite(X, allow_nan=allow_nan, msg_dtype=msg_dtype) return @@ -57,25 +74,28 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None, lst = [] for idx in X: arr = X[idx].to_numpy() - lst.append(arr if arr.flags['C_CONTIGUOUS'] else np.ascontiguousarray(arr)) + lst.append(arr if arr.flags["C_CONTIGUOUS"] else np.ascontiguousarray(arr)) else: X = np.asanyarray(X) is_df = False dt = np.dtype(get_dtype(X)) - is_float = dt.kind in 'fc' + is_float = dt.kind in "fc" msg_err = "Input {} contains {} or a value too large for {!r}." 
- type_err = 'infinity' if allow_nan else 'NaN, infinity' - err = msg_err.format( - input_name, type_err, msg_dtype if msg_dtype is not None else dt) + type_err = "infinity" if allow_nan else "NaN, infinity" + err = msg_err.format(input_name, type_err, msg_dtype if msg_dtype is not None else dt) _patching_status = PatchingConditionsChain( - 'sklearn.utils.validation._assert_all_finite') - _dal_ready = _patching_status.and_conditions([ - (X.ndim in [1, 2], "X has not 1 or 2 dimensions."), - (not np.any(np.equal(X.shape, 0)), "X shape contains 0."), - (dt in [np.float32, np.float64], "X dtype is not float32 or float64.")]) + "sklearn.utils.validation._assert_all_finite" + ) + _dal_ready = _patching_status.and_conditions( + [ + (X.ndim in [1, 2], "X has not 1 or 2 dimensions."), + (not np.any(np.equal(X.shape, 0)), "X shape contains 0."), + (dt in [np.float32, np.float64], "X dtype is not float32 or float64."), + ] + ) _patching_status.write_log() if _dal_ready: if X.ndim == 1: @@ -96,35 +116,43 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None, elif is_float and (np.isfinite(_safe_accumulator_op(np.sum, X))): pass elif is_float: - if allow_nan and np.isinf(X).any() or \ - not allow_nan and not np.isfinite(X).all(): + if allow_nan and np.isinf(X).any() or not allow_nan and not np.isfinite(X).all(): raise ValueError(err) # for object dtype data, we only check for NaNs (GH-13254) - elif dt == np.dtype('object') and not allow_nan: + elif dt == np.dtype("object") and not allow_nan: if _object_dtype_isnan(X).any(): raise ValueError(f"Input {input_name} contains NaN") -def _pandas_check_array(array, array_orig, force_all_finite, ensure_min_samples, - ensure_min_features, copy, context): +def _pandas_check_array( + array, + array_orig, + force_all_finite, + ensure_min_samples, + ensure_min_features, + copy, + context, +): if force_all_finite: - _assert_all_finite(array, allow_nan=force_all_finite == 'allow-nan') + _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: - raise ValueError("Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required%s." - % (n_samples, array.shape, ensure_min_samples, - context)) + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) if ensure_min_features > 0: n_features = array.shape[1] if n_features < ensure_min_features: - raise ValueError("Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required%s." - % (n_features, array.shape, ensure_min_features, - context)) + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." 
+ % (n_features, array.shape, ensure_min_features, context) + ) if copy and np.may_share_memory(array, array_orig): array = array.copy() @@ -132,11 +160,21 @@ def _pandas_check_array(array, array_orig, force_all_finite, ensure_min_samples, return array -def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, - dtype="numeric", order=None, copy=False, force_all_finite=True, - ensure_2d=True, allow_nd=False, ensure_min_samples=1, - ensure_min_features=1, estimator=None): - +def _daal_check_array( + array, + accept_sparse=False, + *, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + ensure_min_samples=1, + ensure_min_features=1, + estimator=None, +): """Input validation on an array, list, sparse matrix or similar. By default, the input is checked to be a non-empty 2D array containing @@ -219,9 +257,11 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, array_converted : object The converted and validated array. """ - if force_all_finite not in (True, False, 'allow-nan'): - raise ValueError('force_all_finite should be a bool or "allow-nan"' - '. Got {!r} instead'.format(force_all_finite)) + if force_all_finite not in (True, False, "allow-nan"): + raise ValueError( + 'force_all_finite should be a bool or "allow-nan"' + ". Got {!r} instead".format(force_all_finite) + ) if estimator is not None: if isinstance(estimator, str): @@ -237,17 +277,23 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # a branch for heterogeneous pandas.DataFrame if is_DataFrame(array) and get_number_of_types(array) > 1: from pandas.api.types import is_sparse - if hasattr(array, 'sparse') or \ - not array.dtypes.apply(is_sparse).any(): - return _pandas_check_array(array, array_orig, force_all_finite, - ensure_min_samples, ensure_min_features, - copy, context) + + if hasattr(array, "sparse") or not array.dtypes.apply(is_sparse).any(): + return _pandas_check_array( + array, + array_orig, + force_all_finite, + ensure_min_samples, + ensure_min_features, + copy, + context, + ) # store whether originally we wanted numeric dtype dtype_numeric = isinstance(dtype, str) and dtype == "numeric" dtype_orig = getattr(array, "dtype", None) - if not hasattr(dtype_orig, 'kind'): + if not hasattr(dtype_orig, "kind"): # not a data type (e.g. a column named dtype in a pandas DataFrame) dtype_orig = None @@ -255,13 +301,13 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None has_pd_integer_array = False - if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): + if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"): # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be perserved (later). with suppress(ImportError): from pandas.api.types import is_sparse - if not hasattr(array, 'sparse') and \ - array.dtypes.apply(is_sparse).any(): + + if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any(): warnings.warn( "pandas.DataFrame with sparse columns found." "It will be converted to a dense numpy array." 
@@ -270,20 +316,36 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtypes_orig = list(array.dtypes) # pandas boolean dtype __array__ interface coerces bools to objects for i, dtype_iter in enumerate(dtypes_orig): - if dtype_iter.kind == 'b': + if dtype_iter.kind == "b": dtypes_orig[i] = np.dtype(np.object) elif dtype_iter.name.startswith(("Int", "UInt")): # name looks like an Integer Extension Array, now check for # the dtype with suppress(ImportError): - from pandas import (Int8Dtype, Int16Dtype, - Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) - if isinstance(dtype_iter, (Int8Dtype, Int16Dtype, - Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype)): + from pandas import ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ) + + if isinstance( + dtype_iter, + ( + Int8Dtype, + Int16Dtype, + Int32Dtype, + Int64Dtype, + UInt8Dtype, + UInt16Dtype, + UInt32Dtype, + UInt64Dtype, + ), + ): has_pd_integer_array = True if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): @@ -310,16 +372,20 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, array = array.astype(dtype) # When all dataframe columns are sparse, convert to a sparse array - if hasattr(array, 'sparse') and array.ndim > 1: + if hasattr(array, "sparse") and array.ndim > 1: # DataFrame.sparse only supports `to_coo` array = array.sparse.to_coo() if sp.issparse(array): _ensure_no_complex_data(array) - array = _ensure_sparse_format(array, accept_sparse=accept_sparse, - dtype=dtype, copy=copy, - force_all_finite=force_all_finite, - accept_large_sparse=accept_large_sparse) + array = _ensure_sparse_format( + array, + accept_sparse=accept_sparse, + dtype=dtype, + copy=copy, + force_all_finite=force_all_finite, + accept_large_sparse=accept_large_sparse, + ) else: # If np.array(..) gives ComplexWarning, then we convert the warning # to an error. This is needed because specifying a non complex @@ -328,21 +394,19 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, # of warnings context manager. with warnings.catch_warnings(): try: - warnings.simplefilter('error', ComplexWarning) - if dtype is not None and np.dtype(dtype).kind in 'iu': + warnings.simplefilter("error", ComplexWarning) + if dtype is not None and np.dtype(dtype).kind in "iu": # Conversion float -> int should not contain NaN or # inf (numpy#14412). We cannot use casting='safe' because # then conversion float -> int would be disallowed. array = np.asarray(array, order=order) - if array.dtype.kind == 'f': - _assert_all_finite(array, allow_nan=False, - msg_dtype=dtype) + if array.dtype.kind == "f": + _assert_all_finite(array, allow_nan=False, msg_dtype=dtype) array = array.astype(dtype, casting="unsafe", copy=False) else: array = np.asarray(array, order=order, dtype=dtype) except ComplexWarning: - raise ValueError("Complex data not supported\n" - "{}\n".format(array)) + raise ValueError("Complex data not supported\n" "{}\n".format(array)) # It is possible that the np.array(..) gave no warning. This happens # when no dtype conversion happened, for example dtype = None. 
The @@ -357,14 +421,16 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, "Expected 2D array, got scalar array instead:\narray={}.\n" "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " - "if it contains a single sample.".format(array)) + "if it contains a single sample.".format(array) + ) # If input is 1D raise error if array.ndim == 1: raise ValueError( "Expected 2D array, got 1D array instead:\narray={}.\n" "Reshape your data either using array.reshape(-1, 1) if " "your data has a single feature or array.reshape(1, -1) " - "if it contains a single sample.".format(array)) + "if it contains a single sample.".format(array) + ) # in the future np.flexible dtypes will be handled like object dtypes if dtype_numeric and np.issubdtype(array.dtype, np.flexible): @@ -375,33 +441,39 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, "a float dtype before using it in scikit-learn, " "for example by using " "your_array = your_array.astype(np.float64).", - FutureWarning, stacklevel=2) + FutureWarning, + stacklevel=2, + ) # make sure we actually converted to numeric: if dtype_numeric and array.dtype.kind == "O": array = array.astype(np.float64) if not allow_nd and array.ndim >= 3: - raise ValueError("Found array with dim %d. %s expected <= 2." - % (array.ndim, estimator_name)) + raise ValueError( + "Found array with dim %d. %s expected <= 2." + % (array.ndim, estimator_name) + ) if force_all_finite: - _assert_all_finite(array, allow_nan=force_all_finite == 'allow-nan') + _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") if ensure_min_samples > 0: n_samples = _num_samples(array) if n_samples < ensure_min_samples: - raise ValueError("Found array with %d sample(s) (shape=%s) while a" - " minimum of %d is required%s." - % (n_samples, array.shape, ensure_min_samples, - context)) + raise ValueError( + "Found array with %d sample(s) (shape=%s) while a" + " minimum of %d is required%s." + % (n_samples, array.shape, ensure_min_samples, context) + ) if ensure_min_features > 0 and array.ndim == 2: n_features = array.shape[1] if n_features < ensure_min_features: - raise ValueError("Found array with %d feature(s) (shape=%s) while" - " a minimum of %d is required%s." - % (n_features, array.shape, ensure_min_features, - context)) + raise ValueError( + "Found array with %d feature(s) (shape=%s) while" + " a minimum of %d is required%s." + % (n_features, array.shape, ensure_min_features, context) + ) if copy and np.may_share_memory(array, array_orig): array = np.array(array, dtype=dtype, order=order) @@ -409,11 +481,24 @@ def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True, return array -def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, - dtype="numeric", order=None, copy=False, force_all_finite=True, - ensure_2d=True, allow_nd=False, multi_output=False, - ensure_min_samples=1, ensure_min_features=1, y_numeric=False, - estimator=None): +def _daal_check_X_y( + X, + y, + accept_sparse=False, + *, + accept_large_sparse=True, + dtype="numeric", + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + allow_nd=False, + multi_output=False, + ensure_min_samples=1, + ensure_min_features=1, + y_numeric=False, + estimator=None, +): """Input validation for standard estimators. Checks X and y for consistent length, enforces X to be 2D and y 1D. 
By @@ -516,22 +601,27 @@ def _daal_check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, raise ValueError("y cannot be None") X = _daal_check_array( - X, accept_sparse=accept_sparse, + X, + accept_sparse=accept_sparse, accept_large_sparse=accept_large_sparse, - dtype=dtype, order=order, copy=copy, + dtype=dtype, + order=order, + copy=copy, force_all_finite=force_all_finite, - ensure_2d=ensure_2d, allow_nd=allow_nd, + ensure_2d=ensure_2d, + allow_nd=allow_nd, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, - estimator=estimator + estimator=estimator, ) if multi_output: - y = _daal_check_array(y, accept_sparse='csr', force_all_finite=True, - ensure_2d=False, dtype=None) + y = _daal_check_array( + y, accept_sparse="csr", force_all_finite=True, ensure_2d=False, dtype=None + ) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) - if y_numeric and hasattr(y, 'dtype') and y.dtype.kind == 'O': + if y_numeric and hasattr(y, "dtype") and y.dtype.kind == "O": y = y.astype(np.float64) check_consistent_length(X, y) diff --git a/onedal/__init__.py b/onedal/__init__.py index 3a425a8c0a..5704f42461 100644 --- a/onedal/__init__.py +++ b/onedal/__init__.py @@ -15,45 +15,52 @@ # =============================================================================== import platform + from daal4py.sklearn._utils import daal_check_version if "Windows" in platform.system(): import os - import sys import site + import sys + path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") if sys.version_info.minor >= 8: - if 'DALROOT' in os.environ: - dal_root_redist = os.path.join( - os.environ['DALROOT'], "redist", "intel64") + if "DALROOT" in os.environ: + dal_root_redist = os.path.join(os.environ["DALROOT"], "redist", "intel64") if os.path.exists(dal_root_redist): os.add_dll_directory(dal_root_redist) os.add_dll_directory(path_to_libs) - os.environ['PATH'] = path_to_libs + os.pathsep + os.environ['PATH'] + os.environ["PATH"] = path_to_libs + os.pathsep + os.environ["PATH"] try: import onedal._onedal_py_dpc as _backend + _is_dpc_backend = True except ImportError: import onedal._onedal_py_host as _backend + _is_dpc_backend = False -__all__ = ['decomposition', 'ensemble', 'neighbors', 'primitives', 'svm'] +__all__ = ["decomposition", "ensemble", "neighbors", "primitives", "svm"] if _is_dpc_backend: - __all__.append('spmd') + __all__.append("spmd") -if daal_check_version((2023, 'P', 100)): - __all__ += ['basic_statistics', 'linear_model'] +if daal_check_version((2023, "P", 100)): + __all__ += ["basic_statistics", "linear_model"] if _is_dpc_backend: - __all__ += ['spmd.basic_statistics', 'spmd.decomposition', - 'spmd.linear_model', 'spmd.neighbors'] + __all__ += [ + "spmd.basic_statistics", + "spmd.decomposition", + "spmd.linear_model", + "spmd.neighbors", + ] -if daal_check_version((2023, 'P', 200)): - __all__ += ['cluster'] +if daal_check_version((2023, "P", 200)): + __all__ += ["cluster"] if _is_dpc_backend: - __all__ += ['spmd.cluster'] + __all__ += ["spmd.cluster"] diff --git a/onedal/_device_offload.py b/onedal/_device_offload.py index 09cd48f681..6ff1990ebe 100644 --- a/onedal/_device_offload.py +++ b/onedal/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,22 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from functools import wraps try: - from sklearnex._device_offload import (_get_global_queue, - _transfer_to_host, - _copy_to_usm) + from sklearnex._device_offload import ( + _copy_to_usm, + _get_global_queue, + _transfer_to_host, + ) + _sklearnex_available = True except ImportError: import logging - logging.warning('Device support requires ' - 'Intel(R) Extension for Scikit-learn*.') + + logging.warning("Device support requires " "Intel(R) Extension for Scikit-learn*.") _sklearnex_available = False @@ -40,9 +43,7 @@ def _extract_usm_iface(*args, **kwargs): allargs = (*args, *kwargs.values()) if len(allargs) == 0: return None - return getattr(allargs[0], - '__sycl_usm_array_interface__', - None) + return getattr(allargs[0], "__sycl_usm_array_interface__", None) def _run_on_device(func, obj=None, *args, **kwargs): @@ -57,21 +58,25 @@ def wrapper_impl(obj, *args, **kwargs): if _sklearnex_available: usm_iface = _extract_usm_iface(*args, **kwargs) data_queue, hostargs, hostkwargs = _get_host_inputs(*args, **kwargs) - hostkwargs['queue'] = data_queue + hostkwargs["queue"] = data_queue result = _run_on_device(func, obj, *hostargs, **hostkwargs) - if usm_iface is not None and hasattr(result, '__array_interface__'): + if usm_iface is not None and hasattr(result, "__array_interface__"): return _copy_to_usm(data_queue, result) return result return _run_on_device(func, obj, *args, **kwargs) if freefunc: + @wraps(func) def wrapper_free(*args, **kwargs): return wrapper_impl(None, *args, **kwargs) + return wrapper_free @wraps(func) def wrapper_with_self(self, *args, **kwargs): return wrapper_impl(self, *args, **kwargs) + return wrapper_with_self + return decorator diff --git a/onedal/basic_statistics/__init__.py b/onedal/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/onedal/basic_statistics/__init__.py +++ b/onedal/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/onedal/basic_statistics/basic_statistics.py b/onedal/basic_statistics/basic_statistics.py index 09f9c982e1..b048f9c02f 100644 --- a/onedal/basic_statistics/basic_statistics.py +++ b/onedal/basic_statistics/basic_statistics.py @@ -14,20 +14,17 @@ # limitations under the License. 
# =============================================================================== -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod - -import numpy as np from numbers import Number -from ..common._policy import _get_policy +import numpy as np +from sklearn.base import BaseEstimator -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) from onedal import _backend +from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, from_table, to_table + class BaseBasicStatistics(metaclass=ABCMeta): @abstractmethod @@ -37,10 +34,18 @@ def __init__(self, result_options, algorithm): @staticmethod def get_all_result_options(): - return ["min", "max", "sum", "mean", - "variance", "variation", "sum_squares", - "standard_deviation", "sum_squares_centered", - "second_order_raw_moment"] + return [ + "min", + "max", + "sum", + "mean", + "variance", + "variation", + "sum_squares", + "standard_deviation", + "sum_squares_centered", + "second_order_raw_moment", + ] def _get_policy(self, queue, *data): return _get_policy(queue, *data) @@ -56,8 +61,9 @@ def _get_result_options(self, options): def _get_onedal_params(self, dtype=np.float32): options = self._get_result_options(self.options) return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': self.algorithm, 'result_option': options, + "fptype": "float" if dtype == np.float32 else "double", + "method": self.algorithm, + "result_option": options, } def _compute_raw(self, data_table, weights_table, module, policy, dtype=np.float32): @@ -78,14 +84,12 @@ def _compute(self, data, weights, module, queue): if not (weights is None): weights = np.asarray(weights) - data, weights = _convert_to_supported( - policy, data, weights) + data, weights = _convert_to_supported(policy, data, weights) data_table, weights_table = to_table(data, weights) dtype = data.dtype - res = self._compute_raw(data_table, weights_table, - module, policy, dtype) + res = self._compute_raw(data_table, weights_table, module, policy, dtype) return {k: from_table(v).ravel() for k, v in res.items()} @@ -95,17 +99,13 @@ class BasicStatistics(BaseBasicStatistics): Basic Statistics oneDAL implementation. 
""" - def __init__( - self, - result_options="all", - *, - algorithm="by_default", - **kwargs): + def __init__(self, result_options="all", *, algorithm="by_default", **kwargs): super().__init__(result_options, algorithm) def compute(self, data, weights=None, queue=None): return super()._compute(data, weights, _backend.basic_statistics.compute, queue) def compute_raw(self, data_table, weights_table, policy, dtype=np.float32): - return super()._compute_raw(data_table, weights_table, - _backend.basic_statistics.compute, policy, dtype) + return super()._compute_raw( + data_table, weights_table, _backend.basic_statistics.compute, policy, dtype + ) diff --git a/onedal/basic_statistics/tests/test_basic_statistics.py b/onedal/basic_statistics/tests/test_basic_statistics.py index 0ca4670d49..01bd7b54ac 100644 --- a/onedal/basic_statistics/tests/test_basic_statistics.py +++ b/onedal/basic_statistics/tests/test_basic_statistics.py @@ -16,9 +16,9 @@ from daal4py.sklearn._utils import daal_check_version, sklearn_check_version -if daal_check_version((2023, 'P', 100)): - import pytest +if daal_check_version((2023, "P", 100)): import numpy as np + import pytest from numpy.testing import assert_allclose from onedal.basic_statistics import BasicStatistics @@ -29,18 +29,17 @@ ("min", np.min, (1e-5, 1e-7)), ("max", np.max, (1e-5, 1e-7)), ("mean", np.mean, (1e-5, 1e-7)), - ("standard_deviation", np.std, (3e-5, 3e-5)) + ("standard_deviation", np.std, (3e-5, 3e-5)), ] - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_basic_uniform(queue, dtype): seed = 42 s_count, f_count = 70000, 29 gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.5, high=+0.6, - size=(s_count, f_count)) + data = gen.uniform(low=-0.5, high=+0.6, size=(s_count, f_count)) data = data.astype(dtype=dtype) alg = BasicStatistics(result_options="mean") @@ -51,9 +50,9 @@ def test_basic_uniform(queue, dtype): tol = 2e-5 if res_mean.dtype == np.float32 else 1e-7 assert_allclose(gtr_mean, res_mean, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('option', options_and_tests) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("option", options_and_tests) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_option_uniform(queue, option, dtype): seed = 77 s_count, f_count = 19999, 31 @@ -62,8 +61,7 @@ def test_option_uniform(queue, option, dtype): fp32tol, fp64tol = tols gen = np.random.default_rng(seed) - data = gen.uniform(low=-0.3, high=+0.7, - size=(s_count, f_count)) + data = gen.uniform(low=-0.3, high=+0.7, size=(s_count, f_count)) data = data.astype(dtype=dtype) alg = BasicStatistics(result_options=result_option) @@ -74,9 +72,9 @@ def test_option_uniform(queue, option, dtype): tol = fp32tol if res.dtype == np.float32 else fp64tol assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('option', options_and_tests) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("option", options_and_tests) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_option_weighted(queue, option, dtype): seed = 999 s_count, f_count = 1024, 127 @@ -86,10 +84,8 @@ def 
test_option_weighted(queue, option, dtype): fp32tol, fp64tol = 30 * fp32tol, 50 * fp64tol gen = np.random.default_rng(seed) - data = gen.uniform(low=-5.0, high=+9.0, - size=(s_count, f_count)) - weights = gen.uniform(low=-0.5, high=+1.0, - size=s_count) + data = gen.uniform(low=-5.0, high=+9.0, size=(s_count, f_count)) + weights = gen.uniform(low=-0.5, high=+1.0, size=s_count) data = data.astype(dtype=dtype) weights = weights.astype(dtype=dtype) diff --git a/onedal/cluster/__init__.py b/onedal/cluster/__init__.py index 609f7670b3..d8e38f9632 100644 --- a/onedal/cluster/__init__.py +++ b/onedal/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version from .kmeans import KMeans, k_means -__all__ = ['KMeans', 'k_means'] +__all__ = ["KMeans", "k_means"] -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit, kmeans_plusplus - __all__ += ['KMeansInit', 'kmeans_plusplus'] + + __all__ += ["KMeansInit", "kmeans_plusplus"] diff --git a/onedal/cluster/kmeans.py b/onedal/cluster/kmeans.py index 60120f5b56..1519d88e23 100644 --- a/onedal/cluster/kmeans.py +++ b/onedal/cluster/kmeans.py @@ -15,55 +15,45 @@ # =============================================================================== import warnings +from abc import ABC import numpy as np +from daal4py.sklearn._utils import daal_check_version, get_dtype from onedal import _backend -from abc import ABC - -from daal4py.sklearn._utils import get_dtype -from daal4py.sklearn._utils import daal_check_version - -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) +from ..datatypes import _convert_to_supported, from_table, to_table -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans_init import KMeansInit else: from sklearn.cluster import _kmeans_plusplus -from onedal.basic_statistics import BasicStatistics - -from ..common._policy import _get_policy -from ..utils import _is_arraylike_not_scalar - +from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin from sklearn.exceptions import ConvergenceWarning +from sklearn.metrics.pairwise import euclidean_distances +from sklearn.utils import check_array, check_random_state from sklearn.utils.validation import check_is_fitted -from sklearn.utils import check_random_state, check_array -from sklearn.base import ( - BaseEstimator, - ClusterMixin, - TransformerMixin) +from onedal.basic_statistics import BasicStatistics -from sklearn.metrics.pairwise import euclidean_distances +from ..common._policy import _get_policy +from ..utils import _is_arraylike_not_scalar class _BaseKMeans(TransformerMixin, ClusterMixin, BaseEstimator, ABC): def __init__( - self, - n_clusters, - *, - init, - n_init, - max_iter, - tol, - verbose, - random_state, - n_local_trials=None): + self, + n_clusters, + *, + init, + n_init, + max_iter, + 
tol, + verbose, + random_state, + n_local_trials=None, + ): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter @@ -98,11 +88,8 @@ def _tolerance(self, rtol, X_table, policy, dtype=np.float32): return mean_var * rtol def _check_params_vs_input( - self, - X_table, - policy, - default_n_init=10, - dtype=np.float32): + self, X_table, policy, default_n_init=10, dtype=np.float32 + ): # n_clusters if X_table.shape[0] < self.n_clusters: raise ValueError( @@ -155,11 +142,12 @@ def _get_policy(self, queue, *data): def _get_onedal_params(self, dtype=np.float32): thr = self._tol if hasattr(self, "_tol") else self.tol return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': 'by_default', 'seed': -1, - 'max_iteration_count': self.max_iter, - 'cluster_count': self.n_clusters, - 'accuracy_threshold': thr, + "fptype": "float" if dtype == np.float32 else "double", + "method": "by_default", + "seed": -1, + "max_iteration_count": self.max_iter, + "cluster_count": self.n_clusters, + "accuracy_threshold": thr, } def _get_params_and_input(self, X, policy): @@ -180,26 +168,19 @@ def _get_params_and_input(self, X, policy): return (params, X_table, dtype) def _init_centroids_custom( - self, - X_table, - init, - random_seed, - policy, - dtype=np.float32, - n_centroids=None): + self, X_table, init, random_seed, policy, dtype=np.float32, n_centroids=None + ): n_clusters = self.n_clusters if n_centroids is None else n_centroids if isinstance(init, str) and init == "k-means++": alg = KMeansInit( - cluster_count=n_clusters, - seed=random_seed, - algorithm="plus_plus_dense") + cluster_count=n_clusters, seed=random_seed, algorithm="plus_plus_dense" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif isinstance(init, str) and init == "random": alg = KMeansInit( - cluster_count=n_clusters, - seed=random_seed, - algorithm="random_dense") + cluster_count=n_clusters, seed=random_seed, algorithm="random_dense" + ) centers_table = alg.compute_raw(X_table, policy, dtype) elif _is_arraylike_not_scalar(init): centers = np.asarray(init) @@ -222,11 +203,7 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 random_state=random_state, ) elif isinstance(init, str) and init == "random": - seeds = random_state.choice( - n_samples, - size=self.n_clusters, - replace=False - ) + seeds = random_state.choice(n_samples, size=self.n_clusters, replace=False) centers = X[seeds] elif callable(init): cc_arr = init(X, self.n_clusters, random_state) @@ -238,7 +215,8 @@ def _init_centroids_generic(self, X, init, random_state, policy, dtype=np.float3 else: raise ValueError( f"init should be either 'k-means++', 'random', a ndarray or a " - f"callable, got '{ init }' instead.") + f"callable, got '{ init }' instead." 
+ ) centers = _convert_to_supported(policy, centers) return to_table(centers) @@ -252,8 +230,12 @@ def _fit_backend(self, X_table, centroids_table, module, policy, dtype=np.float3 result = module.train(policy, params, X_table, centroids_table) - return (result.responses, result.objective_function_value, - result.model, result.iteration_count) + return ( + result.responses, + result.objective_function_value, + result.model, + result.iteration_count, + ) def _fit(self, X, module, queue=None): policy = self._get_policy(queue, X) @@ -271,7 +253,8 @@ def is_better_iteration(inertia, labels): mod = _backend.kmeans_common better_inertia = inertia < best_inertia same_clusters = mod._is_same_clustering( - labels, best_labels, self.n_clusters) + labels, best_labels, self.n_clusters + ) return better_inertia and not same_clusters random_state = check_random_state(self.random_state) @@ -282,12 +265,12 @@ def is_better_iteration(inertia, labels): init = check_array(init, dtype=dtype, copy=True, order="C") self._validate_center_shape(X, init) - use_custom_init = daal_check_version((2023, 'P', 200)) and not callable(self.init) + use_custom_init = daal_check_version((2023, "P", 200)) and not callable(self.init) for _ in range(self._n_init): if use_custom_init: - #random_seed = random_state.tomaxint() - random_seed = random_state.randint(np.iinfo('i').max) + # random_seed = random_state.tomaxint() + random_seed = random_state.randint(np.iinfo("i").max) centroids_table = self._init_centroids_custom( X_table, init, random_seed, policy, dtype=dtype ) @@ -304,9 +287,7 @@ def is_better_iteration(inertia, labels): ) if self.verbose: - print("KMeans iteration completed with " - "inertia {}.".format(inertia) - ) + print("KMeans iteration completed with " "inertia {}.".format(inertia)) if is_better_iteration(inertia, labels): best_model, best_n_iter = model, n_iter @@ -498,7 +479,7 @@ def k_means( copy_x=True, algorithm="lloyd", return_n_iter=False, - queue=None + queue=None, ): est = KMeans( n_clusters=n_clusters, diff --git a/onedal/cluster/kmeans_init.py b/onedal/cluster/kmeans_init.py index cb3188a162..3f5a2b65b5 100755 --- a/onedal/cluster/kmeans_init.py +++ b/onedal/cluster/kmeans_init.py @@ -15,32 +15,28 @@ # =============================================================================== import numpy as np +from sklearn.utils import check_random_state +from daal4py.sklearn._utils import daal_check_version, get_dtype from onedal import _backend -from daal4py.sklearn._utils import get_dtype -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) - from ..common._policy import _get_policy +from ..datatypes import _convert_to_supported, from_table, to_table -from sklearn.utils import check_random_state - -from daal4py.sklearn._utils import daal_check_version +if daal_check_version((2023, "P", 200)): -if daal_check_version((2023, 'P', 200)): class KMeansInit: """ KMeansInit oneDAL implementation. 
""" - def __init__(self, - cluster_count, - seed=777, - local_trials_count=None, - algorithm='plus_plus_dense'): + def __init__( + self, + cluster_count, + seed=777, + local_trials_count=None, + algorithm="plus_plus_dense", + ): self.cluster_count = cluster_count self.seed = seed self.local_trials_count = local_trials_count @@ -56,10 +52,11 @@ def _get_policy(self, queue, *data): def _get_onedal_params(self, dtype=np.float32): return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'local_trials_count': self.local_trials_count, - 'method': self.algorithm, 'seed': self.seed, - 'cluster_count': self.cluster_count, + "fptype": "float" if dtype == np.float32 else "double", + "local_trials_count": self.local_trials_count, + "method": self.algorithm, + "seed": self.seed, + "cluster_count": self.cluster_count, } def _get_params_and_input(self, X, policy): @@ -96,16 +93,18 @@ def compute(self, X, queue=None): return self._compute(X, _backend.kmeans_init.init, queue) def kmeans_plusplus( - X, - n_clusters, - *, - x_squared_norms=None, - random_state=None, - n_local_trials=None, - queue=None): + X, + n_clusters, + *, + x_squared_norms=None, + random_state=None, + n_local_trials=None, + queue=None, + ): random_seed = check_random_state(random_state).tomaxint() return ( KMeansInit( - n_clusters, seed=random_seed, local_trials_count=n_local_trials).compute( - X, queue), np.full( - n_clusters, -1)) + n_clusters, seed=random_seed, local_trials_count=n_local_trials + ).compute(X, queue), + np.full(n_clusters, -1), + ) diff --git a/onedal/cluster/tests/test_kmeans.py b/onedal/cluster/tests/test_kmeans.py index 61a40962a0..ac3f305353 100644 --- a/onedal/cluster/tests/test_kmeans.py +++ b/onedal/cluster/tests/test_kmeans.py @@ -14,21 +14,20 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np - +import pytest from numpy.testing import assert_array_equal + from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): + from sklearn.cluster import kmeans_plusplus as init_external + from sklearn.neighbors import NearestNeighbors + from onedal.cluster import KMeans from onedal.cluster import kmeans_plusplus as init_internal from onedal.tests.utils._device_selection import get_queues - from sklearn.cluster import kmeans_plusplus as init_external - - from sklearn.neighbors import NearestNeighbors - def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32): # We need some reference value of points for each cluster n_points = (n_dim * n_cluster) if n_points is None else n_points @@ -46,7 +45,7 @@ def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32 # Generating dataset def gen_one(c): - params = {'loc': cs[c, :], 'scale': vs[c], 'size': (n_points, n_dim)} + params = {"loc": cs[c, :], "scale": vs[c], "size": (n_points, n_dim)} return gen.normal(**params) data = [gen_one(c) for c in range(n_cluster)] @@ -57,23 +56,23 @@ def gen_one(c): return (cs, vs, data) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_dim', [3, 4, 17, 24]) - @pytest.mark.parametrize('n_cluster', [9, 11, 32]) - @pytest.mark.parametrize('pipeline', ['implicit', 'external', 'internal']) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_dim", [3, 4, 17, 24]) + @pytest.mark.parametrize("n_cluster", [9, 11, 32]) + @pytest.mark.parametrize("pipeline", ["implicit", "external", "internal"]) def test_generated_dataset(queue, dtype, n_dim, n_cluster, pipeline): seed = 777 * n_dim * n_cluster cs, vs, X = generate_dataset(n_dim, n_cluster, seed=seed, dtype=dtype) - if pipeline == 'external': + if pipeline == "external": init_data, _ = init_external(X, n_cluster) m = KMeans(n_cluster, init=init_data, max_iter=5) - elif pipeline == 'internal': + elif pipeline == "internal": init_data, _ = init_internal(X, n_cluster, queue=queue) m = KMeans(n_cluster, init=init_data, max_iter=5) else: - m = KMeans(n_cluster, init='k-means++', max_iter=5) + m = KMeans(n_cluster, init="k-means++", max_iter=5) m.fit(X, queue=queue) diff --git a/onedal/cluster/tests/test_kmeans_init.py b/onedal/cluster/tests/test_kmeans_init.py index 6d92ab9c44..932918aa53 100755 --- a/onedal/cluster/tests/test_kmeans_init.py +++ b/onedal/cluster/tests/test_kmeans_init.py @@ -14,22 +14,22 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np - +import pytest from numpy.testing import assert_array_equal -from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): - from onedal.cluster import kmeans_plusplus, KMeans - from onedal.tests.utils._device_selection import get_queues +from daal4py.sklearn._utils import daal_check_version +if daal_check_version((2023, "P", 200)): from sklearn.datasets import load_breast_cancer from sklearn.metrics import davies_bouldin_score - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_cluster', [2, 5, 11, 128]) + from onedal.cluster import KMeans, kmeans_plusplus + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_cluster", [2, 5, 11, 128]) def test_breast_cancer(queue, dtype, n_cluster): X, _ = load_breast_cancer(return_X_y=True) X = np.asarray(X).astype(dtype=dtype) @@ -58,7 +58,7 @@ def generate_dataset(n_dim, n_cluster, n_points=None, seed=777, dtype=np.float32 # Generating dataset def gen_one(c): - params = {'loc': cs[c, :], 'scale': vs[c], 'size': (n_points, n_dim)} + params = {"loc": cs[c, :], "scale": vs[c], "size": (n_points, n_dim)} return gen.normal(**params) data = [gen_one(c) for c in range(n_cluster)] @@ -69,10 +69,10 @@ def gen_one(c): return (cs, vs, data) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) - @pytest.mark.parametrize('n_dim', [3, 12, 17]) - @pytest.mark.parametrize('n_cluster', [2, 15, 61]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) + @pytest.mark.parametrize("n_dim", [3, 12, 17]) + @pytest.mark.parametrize("n_cluster", [2, 15, 61]) def test_generated_dataset(queue, dtype, n_dim, n_cluster): seed = 777 * n_dim * n_cluster cs, vs, X = generate_dataset(n_dim, n_cluster, seed=seed, dtype=dtype) diff --git a/onedal/common/_estimator_checks.py b/onedal/common/_estimator_checks.py index 034b724b65..e42efea8e4 100755 --- a/onedal/common/_estimator_checks.py +++ b/onedal/common/_estimator_checks.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== def _check_is_fitted(estimator, attributes=None, *, msg=None): if msg is None: - msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " - "appropriate arguments before using this estimator.") + msg = ( + "This %(name)s instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator." + ) - if not hasattr(estimator, 'fit'): + if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." 
% (estimator)) if attributes is not None: @@ -28,11 +30,10 @@ def _check_is_fitted(estimator, attributes=None, *, msg=None): attributes = [attributes] attrs = all([hasattr(estimator, attr) for attr in attributes]) else: - attrs = [v for v in vars(estimator) - if v.endswith("_") and not v.startswith("__")] + attrs = [v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")] if not attrs: - raise AttributeError(msg % {'name': type(estimator).__name__}) + raise AttributeError(msg % {"name": type(estimator).__name__}) def _is_classifier(estimator): diff --git a/onedal/common/_mixin.py b/onedal/common/_mixin.py index 9b1adeb819..94efb1daf6 100644 --- a/onedal/common/_mixin.py +++ b/onedal/common/_mixin.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== + class ClassifierMixin: _estimator_type = "classifier" def score(self, X, y, sample_weight=None, queue=None): from sklearn.metrics import accuracy_score - return accuracy_score(y, self.predict(X, queue=queue), - sample_weight=sample_weight) + + return accuracy_score( + y, self.predict(X, queue=queue), sample_weight=sample_weight + ) def _more_tags(self): return {"requires_y": True} @@ -31,6 +34,7 @@ class RegressorMixin: def score(self, X, y, sample_weight=None, queue=None): from sklearn.metrics import r2_score + return r2_score(y, self.predict(X, queue=queue), sample_weight=sample_weight) def _more_tags(self): diff --git a/onedal/common/_policy.py b/onedal/common/_policy.py index 49dc4863a5..3de7f769d7 100644 --- a/onedal/common/_policy.py +++ b/onedal/common/_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from onedal import _backend, _is_dpc_backend import sys -oneapi_is_available = 'daal4py.oneapi' in sys.modules +from onedal import _backend, _is_dpc_backend + +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_sycl_ctxt, sycl_execution_context @@ -35,9 +36,9 @@ def _get_policy(queue, *data): def _get_queue(*data): - if len(data) > 0 and hasattr(data[0], '__sycl_usm_array_interface__'): + if len(data) > 0 and hasattr(data[0], "__sycl_usm_array_interface__"): # Assume that all data reside on the same device - return data[0].__sycl_usm_array_interface__['syclobj'] + return data[0].__sycl_usm_array_interface__["syclobj"] return None @@ -47,7 +48,7 @@ def __init__(self): self._host_context = None if oneapi_is_available: self._d4p_context = _get_sycl_ctxt() - self._host_context = sycl_execution_context('cpu') + self._host_context = sycl_execution_context("cpu") self._host_context.apply() def __del__(self): @@ -62,12 +63,14 @@ def __init__(self): if _is_dpc_backend: + class _DataParallelInteropPolicy(_backend.data_parallel_policy): def __init__(self, queue): self._queue = queue self._d4p_interop = _Daal4PyContextReset() - if 'sklearnex' in sys.modules: + if "sklearnex" in sys.modules: from sklearnex._device_offload import DummySyclQueue + if isinstance(queue, DummySyclQueue): super().__init__(self._queue.sycl_device.get_filter_string()) return diff --git a/onedal/common/_spmd_policy.py b/onedal/common/_spmd_policy.py index f829b04363..daea02fc63 100644 --- a/onedal/common/_spmd_policy.py +++ b/onedal/common/_spmd_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from onedal import _backend, _is_dpc_backend import sys +from onedal import _backend, _is_dpc_backend + if _is_dpc_backend: + class _SPMDDataParallelInteropPolicy(_backend.spmd_data_parallel_policy): def __init__(self, queue): self._queue = queue diff --git a/onedal/common/tests/test_policy.py b/onedal/common/tests/test_policy.py index 63783c550d..05350051ca 100644 --- a/onedal/common/tests/test_policy.py +++ b/onedal/common/tests/test_policy.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from onedal.common._policy import _get_policy from onedal.tests.utils._device_selection import ( - get_queues, get_memory_usm, is_dpctl_available, device_type_to_str) + device_type_to_str, + get_memory_usm, + get_queues, + is_dpctl_available, +) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_queue_passed_directly(queue): device_name = device_type_to_str(queue) assert _get_policy(queue).get_device_name() == device_name -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_with_numpy_data(queue): X = np.zeros((5, 3)) y = np.zeros(3) @@ -37,27 +41,28 @@ def test_with_numpy_data(queue): assert _get_policy(queue, X, y).get_device_name() == device_name -@pytest.mark.skipif(not is_dpctl_available(), reason='depends on dpctl') -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('memtype', get_memory_usm()) +@pytest.mark.skipif(not is_dpctl_available(), reason="depends on dpctl") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("memtype", get_memory_usm()) def test_with_usm_ndarray_data(queue, memtype): from dpctl.tensor import usm_ndarray device_name = device_type_to_str(queue) X = usm_ndarray((5, 3), buffer=memtype(5 * 3 * 8, queue=queue)) - y = usm_ndarray((3, ), buffer=memtype(3 * 8, queue=queue)) + y = usm_ndarray((3,), buffer=memtype(3 * 8, queue=queue)) assert _get_policy(None, X, y).get_device_name() == device_name -@pytest.mark.skipif(not is_dpctl_available(['cpu', 'gpu']), - reason='test uses multiple devices') -@pytest.mark.parametrize('memtype', get_memory_usm()) +@pytest.mark.skipif( + not is_dpctl_available(["cpu", "gpu"]), reason="test uses multiple devices" +) +@pytest.mark.parametrize("memtype", get_memory_usm()) def test_queue_parameter_with_usm_ndarray(memtype): from dpctl import SyclQueue from dpctl.tensor import usm_ndarray - q1 = SyclQueue('cpu') - q2 = SyclQueue('gpu') + q1 = SyclQueue("cpu") + q2 = SyclQueue("gpu") X = usm_ndarray((5, 3), buffer=memtype(5 * 3 * 8, queue=q1)) assert _get_policy(q2, X).get_device_name() == device_type_to_str(q2) diff --git a/onedal/datatypes/__init__.py b/onedal/datatypes/__init__.py index 58aab9900a..470fda902e 100644 --- a/onedal/datatypes/__init__.py +++ b/onedal/datatypes/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from ._data_conversion import ( - from_table, - to_table, - _convert_to_supported -) +from ._data_conversion import _convert_to_supported, from_table, to_table -__all__ = ['from_table', 'to_table', '_convert_to_supported'] +__all__ = ["from_table", "to_table", "_convert_to_supported"] diff --git a/onedal/datatypes/_data_conversion.py b/onedal/datatypes/_data_conversion.py index 4e3b0f9cc4..ec5ffc5c08 100644 --- a/onedal/datatypes/_data_conversion.py +++ b/onedal/datatypes/_data_conversion.py @@ -14,17 +14,18 @@ # limitations under the License. # =============================================================================== -import numpy as np import warnings -from onedal import _is_dpc_backend -from onedal import _backend +import numpy as np + from daal4py.sklearn._utils import make2d +from onedal import _backend, _is_dpc_backend try: import dpctl import dpctl.tensor as dpt - dpctl_available = dpctl.__version__ >= '0.14' + + dpctl_available = dpctl.__version__ >= "0.14" except ImportError: dpctl_available = False @@ -67,9 +68,11 @@ def func(x): def convert_or_pass(x): if (x is not None) and (x.dtype == np.float64): - warnings.warn("Data will be converted into float32 from " - "float64 because device does not support it", - RuntimeWarning, ) + warnings.warn( + "Data will be converted into float32 from " + "float64 because device does not support it", + RuntimeWarning, + ) return x.astype(np.float32) else: return x @@ -80,6 +83,7 @@ def convert_or_pass(x): return _apply_and_pass(func, *data) else: + def _convert_to_supported(policy, *data): def func(x): return x diff --git a/onedal/datatypes/tests/test_data.py b/onedal/datatypes/tests/test_data.py index 09ef20b77b..9a91e017e3 100644 --- a/onedal/datatypes/tests/test_data.py +++ b/onedal/datatypes/tests/test_data.py @@ -14,19 +14,19 @@ # limitations under the License. 
# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose -from onedal.primitives import linear_kernel from onedal import _backend - +from onedal.primitives import linear_kernel from onedal.tests.utils._device_selection import get_queues try: import dpctl import dpctl.tensor as dpt - dpctl_available = dpctl.__version__ >= '0.14' + + dpctl_available = dpctl.__version__ >= "0.14" except ImportError: dpctl_available = False @@ -35,7 +35,7 @@ def _test_input_format_c_contiguous_numpy(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") assert x_numpy.flags.c_contiguous assert not x_numpy.flags.f_contiguous assert not x_numpy.flags.fnc @@ -46,8 +46,8 @@ def _test_input_format_c_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_contiguous_numpy(queue, dtype): _test_input_format_c_contiguous_numpy(queue, dtype) @@ -56,7 +56,7 @@ def _test_input_format_f_contiguous_numpy(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") assert not x_numpy.flags.c_contiguous assert x_numpy.flags.f_contiguous assert x_numpy.flags.fnc @@ -67,8 +67,8 @@ def _test_input_format_f_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_numpy(queue, dtype): _test_input_format_f_contiguous_numpy(queue, dtype) @@ -92,18 +92,18 @@ def _test_input_format_c_not_contiguous_numpy(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_c_not_contiguous_numpy(queue, dtype): _test_input_format_c_not_contiguous_numpy(queue, dtype) def _test_input_format_c_contiguous_pandas(queue, dtype): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") assert x_numpy.flags.c_contiguous assert not x_numpy.flags.f_contiguous assert not x_numpy.flags.fnc @@ -115,18 +115,18 @@ def _test_input_format_c_contiguous_pandas(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def 
test_input_format_c_contiguous_pandas(queue, dtype): _test_input_format_c_contiguous_pandas(queue, dtype) def _test_input_format_f_contiguous_pandas(queue, dtype): - pd = pytest.importorskip('pandas') + pd = pytest.importorskip("pandas") rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 4)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") assert not x_numpy.flags.c_contiguous assert x_numpy.flags.f_contiguous assert x_numpy.flags.fnc @@ -138,31 +138,32 @@ def _test_input_format_f_contiguous_pandas(queue, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_input_format_f_contiguous_pandas(queue, dtype): _test_input_format_f_contiguous_pandas(queue, dtype) -@pytest.mark.skipif(not dpctl_available, - reason="requires dpctl>=0.14") -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.skipif(not dpctl_available, reason="requires dpctl>=0.14") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_input_format_c_contiguous_dpctl(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 59)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='C') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="C") x_dpt = dpt.asarray(x_numpy, usm_type="device", sycl_queue=queue) # assert not x_dpt.flags.fnc assert isinstance(x_dpt, dpt.usm_ndarray) x_table = _backend.dpctl_to_table(x_dpt) - assert hasattr(x_table, '__sycl_usm_array_interface__') + assert hasattr(x_table, "__sycl_usm_array_interface__") x_dpt_from_table = dpt.asarray(x_table) - assert x_dpt.__sycl_usm_array_interface__[ - 'data'][0] == x_dpt_from_table.__sycl_usm_array_interface__['data'][0] + assert ( + x_dpt.__sycl_usm_array_interface__["data"][0] + == x_dpt_from_table.__sycl_usm_array_interface__["data"][0] + ) assert x_dpt.shape == x_dpt_from_table.shape assert x_dpt.strides == x_dpt_from_table.strides assert x_dpt.dtype == x_dpt_from_table.dtype @@ -170,25 +171,26 @@ def test_input_format_c_contiguous_dpctl(queue, dtype): assert x_dpt_from_table.flags.c_contiguous -@pytest.mark.skipif(not dpctl_available, - reason="requires dpctl>=0.14") -@pytest.mark.parametrize('queue', get_queues('cpu,gpu')) -@pytest.mark.parametrize('dtype', [np.float32, np.float64, np.int32, np.int64]) +@pytest.mark.skipif(not dpctl_available, reason="requires dpctl>=0.14") +@pytest.mark.parametrize("queue", get_queues("cpu,gpu")) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) def test_input_format_f_contiguous_dpctl(queue, dtype): rng = np.random.RandomState(0) x_default = np.array(5 * rng.random_sample((10, 59)), dtype=dtype) - x_numpy = np.asanyarray(x_default, dtype=dtype, order='F') + x_numpy = np.asanyarray(x_default, dtype=dtype, order="F") x_dpt = dpt.asarray(x_numpy, usm_type="device", sycl_queue=queue) # assert not x_dpt.flags.fnc assert isinstance(x_dpt, dpt.usm_ndarray) x_table = _backend.dpctl_to_table(x_dpt) - assert hasattr(x_table, '__sycl_usm_array_interface__') + assert hasattr(x_table, "__sycl_usm_array_interface__") 
x_dpt_from_table = dpt.asarray(x_table) - assert x_dpt.__sycl_usm_array_interface__[ - 'data'][0] == x_dpt_from_table.__sycl_usm_array_interface__['data'][0] + assert ( + x_dpt.__sycl_usm_array_interface__["data"][0] + == x_dpt_from_table.__sycl_usm_array_interface__["data"][0] + ) assert x_dpt.shape == x_dpt_from_table.shape assert x_dpt.strides == x_dpt_from_table.strides assert x_dpt.dtype == x_dpt_from_table.dtype diff --git a/onedal/decomposition/__init__.py b/onedal/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/onedal/decomposition/__init__.py +++ b/onedal/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/onedal/decomposition/pca.py b/onedal/decomposition/pca.py index a38a9e2597..b6834c731b 100644 --- a/onedal/decomposition/pca.py +++ b/onedal/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np +from daal4py.sklearn._utils import sklearn_check_version from onedal import _backend + from ..common._policy import _get_policy -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) -from daal4py.sklearn._utils import sklearn_check_version +from ..datatypes import _convert_to_supported, from_table, to_table -class PCA(): +class PCA: def __init__( - self, - n_components=None, - is_deterministic=True, - method='precomputed', - copy=True + self, n_components=None, is_deterministic=True, method="precomputed", copy=True ): self.n_components = n_components self.method = method @@ -39,11 +33,10 @@ def __init__( def get_onedal_params(self, data): return { - 'fptype': - 'float' if data.dtype == np.float32 else 'double', - 'method': self.method, - 'n_components': self.n_components, - 'is_deterministic': self.is_deterministic + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.method, + "n_components": self.n_components, + "is_deterministic": self.is_deterministic, } def _get_policy(self, queue, *data): @@ -56,34 +49,27 @@ def fit(self, X, queue): policy = self._get_policy(queue, X) # TODO: investigate why np.ndarray with OWNDATA=FALSE flag # fails to be converted to oneDAL table - if isinstance(X, np.ndarray) and not X.flags['OWNDATA']: + if isinstance(X, np.ndarray) and not X.flags["OWNDATA"]: X = X.copy() X = _convert_to_supported(policy, X) params = self.get_onedal_params(X) cov_result = _backend.covariance.compute( - policy, - {'fptype': params['fptype'], 'method': 'dense'}, - to_table(X) + policy, {"fptype": params["fptype"], "method": "dense"}, to_table(X) ) covariance_matrix = from_table(cov_result.cov_matrix) self.mean_ = from_table(cov_result.means) result = _backend.decomposition.dim_reduction.train( - policy, - params, - to_table(covariance_matrix) + policy, params, to_table(covariance_matrix) ) self.n_components_ = self.n_components self.variances_ = from_table(result.variances) self.components_ = from_table(result.eigenvectors) - self.explained_variance_ = \ - np.maximum(from_table(result.eigenvalues).ravel(), 0) + self.explained_variance_ = np.maximum(from_table(result.eigenvalues).ravel(), 0) tot_var = covariance_matrix.trace() self.explained_variance_ratio_ = self.explained_variance_ / tot_var - self.singular_values_ = np.sqrt( - (n_samples - 1) * self.explained_variance_ - ) + self.singular_values_ = np.sqrt((n_samples - 1) * self.explained_variance_) if sklearn_check_version("1.2"): self.n_features_in_ = n_features @@ -96,10 +82,8 @@ def fit(self, X, queue): self.n_samples_ = n_samples if self.n_components < n_sf_min: if self.explained_variance_.shape[0] < n_sf_min: - resid_var_ = tot_var - \ - self.explained_variance_[:self.n_components].sum() - self.noise_variance_ = \ - resid_var_ / (n_sf_min - self.n_components) + resid_var_ = tot_var - self.explained_variance_[: self.n_components].sum() + self.noise_variance_ = resid_var_ / (n_sf_min - self.n_components) return self def _create_model(self): @@ -114,8 +98,7 @@ def predict(self, X, queue): X = _convert_to_supported(policy, X) params = self.get_onedal_params(X) - result = _backend.decomposition.dim_reduction.infer(policy, - params, - model, - to_table(X)) + result = _backend.decomposition.dim_reduction.infer( + policy, params, model, to_table(X) + ) return from_table(result.transformed_data) diff 
--git a/onedal/ensemble/__init__.py b/onedal/ensemble/__init__.py index 30d18d71b4..86e60b233a 100644 --- a/onedal/ensemble/__init__.py +++ b/onedal/ensemble/__init__.py @@ -14,10 +14,16 @@ # limitations under the License. # =============================================================================== -from .forest import RandomForestClassifier, RandomForestRegressor -from .forest import ExtraTreesClassifier, ExtraTreesRegressor +from .forest import ( + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, +) -__all__ = ['RandomForestClassifier', - 'RandomForestRegressor', - 'ExtraTreesClassifier', - 'ExtraTreesRegressor'] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", +] diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 3fa60d12c4..546bd979d5 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -14,77 +14,75 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version) - -from abc import ABCMeta, abstractmethod import numbers -from numbers import Number import warnings +from abc import ABCMeta, abstractmethod +from math import ceil +from numbers import Number + +import numpy as np +from scipy import sparse as sp +from sklearn.ensemble import BaseEnsemble from sklearn.exceptions import DataConversionWarning from sklearn.utils import ( + check_array, check_random_state, compute_sample_weight, - check_array, - deprecated) + deprecated, +) from sklearn.utils.validation import ( - check_is_fitted, + _num_samples, check_consistent_length, - _num_samples) -from math import ceil + check_is_fitted, +) -import numpy as np -from scipy import sparse as sp +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from onedal import _backend +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import ( - _validate_targets, - _check_X_y, _check_array, + _check_n_features, + _check_X_y, _column_or_1d, - _check_n_features + _validate_targets, ) -from onedal import _backend - -from sklearn.ensemble import BaseEnsemble class BaseForest(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__( - self, - n_estimators, - criterion, - max_depth, - min_samples_split, - min_samples_leaf, - min_weight_fraction_leaf, - max_features, - max_leaf_nodes, - min_impurity_decrease, - min_impurity_split, - bootstrap, - oob_score, - random_state, - warm_start, - class_weight, - ccp_alpha, - max_samples, - max_bins, - min_bin_size, - infer_mode, - splitter_mode, - voting_mode, - error_metric_mode, - variable_importance_mode, - algorithm, - **kwargs): + self, + n_estimators, + criterion, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + min_impurity_decrease, + min_impurity_split, + bootstrap, + oob_score, + random_state, + warm_start, + class_weight, + ccp_alpha, + max_samples, + max_bins, + min_bin_size, + infer_mode, + splitter_mode, + voting_mode, + error_metric_mode, + variable_importance_mode, + algorithm, + **kwargs, 
+ ): self.n_estimators = n_estimators self.bootstrap = bootstrap self.oob_score = oob_score @@ -111,32 +109,41 @@ def __init__( self.variable_importance_mode = variable_importance_mode self.algorithm = algorithm - def _to_absolute_max_features(self, max_features, n_features, - is_classification=False): + def _to_absolute_max_features( + self, max_features, n_features, is_classification=False + ): if max_features is None: return n_features if isinstance(max_features, str): if max_features == "auto": - if not sklearn_check_version('1.3'): - if sklearn_check_version('1.1'): + if not sklearn_check_version("1.3"): + if sklearn_check_version("1.1"): warnings.warn( "`max_features='auto'` has been deprecated in 1.1 " "and will be removed in 1.3. To keep the past behaviour, " "explicitly set `max_features=1.0` or remove this " "parameter as it is also the default value for " "RandomForestRegressors and ExtraTreesRegressors.", - FutureWarning, ) - return max(1, int(np.sqrt(n_features)) - ) if is_classification else n_features - if max_features == 'sqrt': + FutureWarning, + ) + return ( + max(1, int(np.sqrt(n_features))) + if is_classification + else n_features + ) + if max_features == "sqrt": return max(1, int(np.sqrt(n_features))) if max_features == "log2": return max(1, int(np.log2(n_features))) - allowed_string_values = '"sqrt" or "log2"' if sklearn_check_version( - '1.3') else '"auto", "sqrt" or "log2"' + allowed_string_values = ( + '"sqrt" or "log2"' + if sklearn_check_version("1.3") + else '"auto", "sqrt" or "log2"' + ) raise ValueError( - 'Invalid value for max_features. Allowed string ' - f'values are {allowed_string_values}.') + "Invalid value for max_features. Allowed string " + f"values are {allowed_string_values}." + ) if isinstance(max_features, (numbers.Integral, np.integer)): return max_features if max_features > 0.0: @@ -145,10 +152,10 @@ def _to_absolute_max_features(self, max_features, n_features, def _get_observations_per_tree_fraction(self, n_samples, max_samples): if max_samples is None: - return 1. + return 1.0 if isinstance(max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, max_samples)) @@ -159,9 +166,9 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): return max(float(max_samples / n_samples), 1 / n_samples) if isinstance(max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(max_samples)) @@ -177,12 +184,15 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): def _get_onedal_params(self, data): n_samples, n_features = data.shape features_per_node = self._to_absolute_max_features( - self.max_features, n_features, self.is_classification) + self.max_features, n_features, self.is_classification + ) observations_per_tree_fraction = self._get_observations_per_tree_fraction( - n_samples=n_samples, max_samples=self.max_samples) - observations_per_tree_fraction = observations_per_tree_fraction if bool( - self.bootstrap) else 1. 
+ n_samples=n_samples, max_samples=self.max_samples + ) + observations_per_tree_fraction = ( + observations_per_tree_fraction if bool(self.bootstrap) else 1.0 + ) if not self.bootstrap and self.max_samples is not None: raise ValueError( @@ -191,116 +201,126 @@ def _get_onedal_params(self, data): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") min_observations_in_leaf_node = ( - self.min_samples_leaf if isinstance( - self.min_samples_leaf, - numbers.Integral) else int( - ceil( - self.min_samples_leaf * n_samples))) + self.min_samples_leaf + if isinstance(self.min_samples_leaf, numbers.Integral) + else int(ceil(self.min_samples_leaf * n_samples)) + ) min_observations_in_split_node = ( - self.min_samples_split if isinstance( - self.min_samples_split, - numbers.Integral) else int( - ceil( - self.min_samples_split * n_samples))) + self.min_samples_split + if isinstance(self.min_samples_split, numbers.Integral) + else int(ceil(self.min_samples_split * n_samples)) + ) onedal_params = { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': self.algorithm, - 'infer_mode': self.infer_mode, - 'voting_mode': self.voting_mode, - 'observations_per_tree_fraction': observations_per_tree_fraction, - 'impurity_threshold': float( - 0.0 if self.min_impurity_split is None else self.min_impurity_split), - 'min_weight_fraction_in_leaf_node': self.min_weight_fraction_leaf, - 'min_impurity_decrease_in_split_node': self.min_impurity_decrease, - 'tree_count': int(self.n_estimators), - 'features_per_node': features_per_node, - 'max_tree_depth': int(0 if self.max_depth is None else self.max_depth), - 'min_observations_in_leaf_node': min_observations_in_leaf_node, - 'min_observations_in_split_node': min_observations_in_split_node, - 'max_leaf_nodes': (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'memory_saving_mode': False, - 'bootstrap': bool(self.bootstrap), - 'error_metric_mode': self.error_metric_mode, - 'variable_importance_mode': self.variable_importance_mode, + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.algorithm, + "infer_mode": self.infer_mode, + "voting_mode": self.voting_mode, + "observations_per_tree_fraction": observations_per_tree_fraction, + "impurity_threshold": float( + 0.0 if self.min_impurity_split is None else self.min_impurity_split + ), + "min_weight_fraction_in_leaf_node": self.min_weight_fraction_leaf, + "min_impurity_decrease_in_split_node": self.min_impurity_decrease, + "tree_count": int(self.n_estimators), + "features_per_node": features_per_node, + "max_tree_depth": int(0 if self.max_depth is None else self.max_depth), + "min_observations_in_leaf_node": min_observations_in_leaf_node, + "min_observations_in_split_node": min_observations_in_split_node, + "max_leaf_nodes": (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "memory_saving_mode": False, + "bootstrap": bool(self.bootstrap), + "error_metric_mode": self.error_metric_mode, + "variable_importance_mode": self.variable_importance_mode, } if self.is_classification: - onedal_params['class_count'] = 0 if self.classes_ is None else len( - self.classes_) - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + 
onedal_params["class_count"] = ( + 0 if self.classes_ is None else len(self.classes_) + ) + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode return onedal_params def _check_parameters(self): if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. 
" + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def _validate_targets(self, y, dtype): self.class_weight_ = None @@ -313,18 +333,20 @@ def _get_sample_weight(self, X, y, sample_weight): if n_samples == 1: raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=dtype) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=dtype + ) sample_weight = sample_weight.ravel() sample_weight_count = sample_weight.shape[0] if sample_weight_count != 0 and sample_weight_count != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (len(sample_weight), X.shape) + ) if sample_weight_count == 0: sample_weight = np.ones(n_samples, dtype=dtype) @@ -332,15 +354,21 @@ def _get_sample_weight(self, X, y, sample_weight): sample_weight = np.full(n_samples, sample_weight, dtype=dtype) else: sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + sample_weight, + accept_sparse=False, + ensure_2d=False, + dtype=dtype, + order="C", ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" 
- .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight def _get_policy(self, queue, *data): @@ -348,27 +376,31 @@ def _get_policy(self, queue, *data): def _fit(self, X, y, sample_weight, module, queue): X, y = _check_X_y( - X, y, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X, + y, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) y = self._validate_targets(y, X.dtype) sample_weight = self._get_sample_weight(X, y, sample_weight) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ policy = self._get_policy(queue, X, y, sample_weight) X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) params = self._get_onedal_params(X) - train_result = module.train( - policy, params, *to_table(X, y, sample_weight)) + train_result = module.train(policy, params, *to_table(X, y, sample_weight)) self._onedal_model = train_result.model if self.oob_score: if self.is_classification: self.oob_score_ = from_table(train_result.oob_err_accuracy)[0, 0] self.oob_decision_function_ = from_table( - train_result.oob_err_decision_function) + train_result.oob_err_decision_function + ) if np.any(self.oob_decision_function_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -379,7 +411,8 @@ def _fit(self, X, y, sample_weight, module, queue): else: self.oob_score_ = from_table(train_result.oob_err_r2)[0, 0] self.oob_prediction_ = from_table( - train_result.oob_err_prediction).reshape(-1) + train_result.oob_err_prediction + ).reshape(-1) if np.any(self.oob_prediction_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -393,12 +426,13 @@ def _fit(self, X, y, sample_weight, module, queue): def _create_model(self, module): # TODO: # upate error msg. 
- raise NotImplementedError('Creating model is not supported.') + raise NotImplementedError("Creating model is not supported.") def _predict(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) @@ -411,13 +445,14 @@ def _predict(self, X, module, queue): def _predict_proba(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) - params['infer_mode'] = 'class_probabilities' + params["infer_mode"] = "class_probabilities" model = self._onedal_model result = module.infer(policy, params, model, to_table(X)) @@ -426,33 +461,35 @@ def _predict_proba(self, X, module, queue): class RandomForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -478,12 +515,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -493,50 +532,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, 
- casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class RandomForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -562,49 +600,53 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, queue).ravel() class ExtraTreesClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + 
min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -630,12 +672,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -645,50 +689,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, - casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class ExtraTreesRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -714,16 +757,18 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, 
queue).ravel() diff --git a/onedal/ensemble/tests/test_random_forest.py b/onedal/ensemble/tests/test_random_forest.py index 84fab6ea44..317c63556a 100644 --- a/onedal/ensemble/tests/test_random_forest.py +++ b/onedal/ensemble/tests/test_random_forest.py @@ -14,56 +14,64 @@ # limitations under the License. # =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose +from sklearn.datasets import make_classification, make_regression from daal4py.sklearn._utils import daal_check_version from onedal.ensemble import RandomForestClassifier, RandomForestRegressor from onedal.tests.utils._device_selection import get_queues -from sklearn.datasets import make_classification, make_regression - -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_rf_classifier(queue): - X, y = make_classification(n_samples=100, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) - rf = RandomForestClassifier( - max_depth=2, random_state=0).fit(X, y, queue=queue) + X, y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) + rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y, queue=queue) assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_rf_regression(queue): - X, y = make_regression(n_samples=100, n_features=4, n_informative=2, - random_state=0, shuffle=False) - rf = RandomForestRegressor( - max_depth=2, random_state=0).fit(X, y, queue=queue) - assert_allclose( - [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) + X, y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False + ) + rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y, queue=queue) + assert_allclose([-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) -@pytest.mark.skipif(not daal_check_version((2023, 'P', 101)), - reason='requires OneDAL 2023.1.1') -@pytest.mark.parametrize('queue', get_queues('gpu')) +@pytest.mark.skipif( + not daal_check_version((2023, "P", 101)), reason="requires OneDAL 2023.1.1" +) +@pytest.mark.parametrize("queue", get_queues("gpu")) def test_rf_classifier_random_splitter(queue): - X, y = make_classification(n_samples=100, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) - rf = RandomForestClassifier( - max_depth=2, random_state=0, - splitter_mode='random').fit(X, y, queue=queue) + X, y = make_classification( + n_samples=100, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) + rf = RandomForestClassifier(max_depth=2, random_state=0, splitter_mode="random").fit( + X, y, queue=queue + ) assert_allclose([1], rf.predict([[0, 0, 0, 0]], queue=queue)) -@pytest.mark.parametrize('queue', get_queues('gpu')) +@pytest.mark.parametrize("queue", get_queues("gpu")) def test_rf_regression_random_splitter(queue): - X, y = make_regression(n_samples=100, n_features=4, n_informative=2, - random_state=0, shuffle=False) - rf = RandomForestRegressor( - max_depth=2, random_state=0, - splitter_mode='random').fit(X, y, queue=queue) - assert_allclose( - [-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) + X, y = make_regression( + n_samples=100, n_features=4, n_informative=2, random_state=0, shuffle=False + 
) + rf = RandomForestRegressor(max_depth=2, random_state=0, splitter_mode="random").fit( + X, y, queue=queue + ) + assert_allclose([-6.83], rf.predict([[0, 0, 0, 0]], queue=queue), atol=1e-2) diff --git a/onedal/linear_model/__init__.py b/onedal/linear_model/__init__.py index b7be0fbcf4..ee4de6210c 100755 --- a/onedal/linear_model/__init__.py +++ b/onedal/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/onedal/linear_model/linear_model.py b/onedal/linear_model/linear_model.py index db40483e61..3c9a310bff 100755 --- a/onedal/linear_model/linear_model.py +++ b/onedal/linear_model/linear_model.py @@ -14,27 +14,20 @@ # limitations under the License. # =============================================================================== -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod +from numbers import Number import numpy as np -from numbers import Number +from sklearn.base import BaseEstimator -from daal4py.sklearn._utils import (get_dtype, make2d) +from daal4py.sklearn._utils import get_dtype, make2d +from onedal import _backend +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) -from ..utils import ( - _check_X_y, - _num_features, - _check_array, - _check_n_features) -from onedal import _backend +from ..datatypes import _convert_to_supported, from_table, to_table +from ..utils import _check_array, _check_n_features, _check_X_y, _num_features class BaseLinearRegression(BaseEstimator, metaclass=ABCMeta): @@ -48,11 +41,12 @@ def _get_policy(self, queue, *data): return _get_policy(queue, *data) def _get_onedal_params(self, dtype=np.float32): - intercept = 'intercept|' if self.fit_intercept else '' + intercept = "intercept|" if self.fit_intercept else "" return { - 'fptype': 'float' if dtype == np.float32 else 'double', - 'method': self.algorithm, 'intercept': self.fit_intercept, - 'result_option': (intercept + 'coefficients'), + "fptype": "float" if dtype == np.float32 else "double", + "method": self.algorithm, + "intercept": self.fit_intercept, + "result_option": (intercept + "coefficients"), } def _fit(self, X, y, module, queue): @@ -70,8 +64,7 @@ def _fit(self, X, y, module, queue): y_loc = np.asarray(y_loc).astype(dtype=dtype) # Finiteness is checked in the sklearnex wrapper - X_loc, y_loc = _check_X_y( - X_loc, y_loc, force_all_finite=False, accept_2d_y=True) + X_loc, y_loc = _check_X_y(X_loc, y_loc, force_all_finite=False, accept_2d_y=True) self.n_features_in_ = _num_features(X_loc, fallback_1d=True) @@ -115,14 +108,18 @@ def _create_model(self, module, policy): intercept = np.asarray(intercept, dtype=dtype) assert 
n_targets_in == intercept.size - intercept = _check_array(intercept, dtype=[np.float64, np.float32], - force_all_finite=True, ensure_2d=False) + intercept = _check_array( + intercept, + dtype=[np.float64, np.float32], + force_all_finite=True, + ensure_2d=False, + ) coefficients = _check_array( coefficients, - dtype=[ - np.float64, - np.float32], - force_all_finite=True, ensure_2d=False) + dtype=[np.float64, np.float32], + force_all_finite=True, + ensure_2d=False, + ) coefficients, intercept = make2d(coefficients), make2d(intercept) coefficients = coefficients.T if n_targets_in == 1 else coefficients @@ -156,11 +153,12 @@ def _predict(self, X, module, queue): X_loc = X # Finiteness is checked in the sklearnex wrapper - X_loc = _check_array(X_loc, dtype=[np.float64, np.float32], - force_all_finite=False, ensure_2d=False) + X_loc = _check_array( + X_loc, dtype=[np.float64, np.float32], force_all_finite=False, ensure_2d=False + ) _check_n_features(self, X_loc, False) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module, policy) @@ -190,12 +188,8 @@ class LinearRegression(RegressorMixin, BaseLinearRegression): """ def __init__( - self, - fit_intercept=True, - copy_X=False, - *, - algorithm='norm_eq', - **kwargs): + self, fit_intercept=True, copy_X=False, *, algorithm="norm_eq", **kwargs + ): super().__init__(fit_intercept=fit_intercept, copy_X=copy_X, algorithm=algorithm) def fit(self, X, y, queue=None): diff --git a/onedal/linear_model/tests/test_linear_regression.py b/onedal/linear_model/tests/test_linear_regression.py index 6809ef30b3..5962c530b1 100755 --- a/onedal/linear_model/tests/test_linear_regression.py +++ b/onedal/linear_model/tests/test_linear_regression.py @@ -16,33 +16,32 @@ from daal4py.sklearn._utils import daal_check_version, sklearn_check_version -if daal_check_version((2023, 'P', 100)): - import pytest +if daal_check_version((2023, "P", 100)): import numpy as np + import pytest from numpy.testing import assert_allclose, assert_array_equal - - from onedal.linear_model import LinearRegression - from onedal.tests.utils._device_selection import get_queues - from sklearn.datasets import load_diabetes from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + from onedal.linear_model import LinearRegression + from onedal.tests.utils._device_selection import get_queues + + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_diabetes(queue, dtype): X, y = load_diabetes(return_X_y=True) X, y = X.astype(dtype), y.astype(dtype) - X_train, X_test, y_train, y_test = \ - train_test_split(X, y, - train_size=0.8, random_state=777) + X_train, X_test, y_train, y_test = train_test_split( + X, y, train_size=0.8, random_state=777 + ) model = LinearRegression(fit_intercept=True) model.fit(X_train, y_train, queue=queue) y_pred = model.predict(X_test, queue=queue) assert mean_squared_error(y_test, y_pred) < 2396 - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_pickle(queue, dtype): X, y = load_diabetes(return_X_y=True) X, y = X.astype(dtype), y.astype(dtype) @@ -51,6 +50,7 @@ def test_pickle(queue, dtype): 
expected = model.predict(X, queue=queue) import pickle + dump = pickle.dumps(model) model2 = pickle.loads(dump) @@ -59,8 +59,8 @@ def test_pickle(queue, dtype): assert_array_equal(expected, result) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_full_results(queue, dtype): seed = 42 f_count, r_count = 19, 7 @@ -90,8 +90,8 @@ def test_full_results(queue, dtype): tol = 2e-4 if res.dtype == np.float32 else 1e-7 assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_no_intercept_results(queue, dtype): seed = 42 f_count, r_count = 19, 7 @@ -117,8 +117,8 @@ def test_no_intercept_results(queue, dtype): tol = 5e-5 if res.dtype == np.float32 else 1e-7 assert_allclose(gtr, res, rtol=tol) - @pytest.mark.parametrize('queue', get_queues()) - @pytest.mark.parametrize('dtype', [np.float32, np.float64]) + @pytest.mark.parametrize("queue", get_queues()) + @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_reconstruct_model(queue, dtype): seed = 42 s_count = 3500 diff --git a/onedal/neighbors/__init__.py b/onedal/neighbors/__init__.py index c535172bb0..a8aede3c4d 100755 --- a/onedal/neighbors/__init__.py +++ b/onedal/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/onedal/neighbors/neighbors.py b/onedal/neighbors/neighbors.py index ef21658c33..52f73f8fa9 100755 --- a/onedal/neighbors/neighbors.py +++ b/onedal/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,36 +12,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABCMeta - from numbers import Integral import numpy as np from daal4py import ( - bf_knn_classification_training, bf_knn_classification_prediction, + bf_knn_classification_training, + kdtree_knn_classification_prediction, kdtree_knn_classification_training, - kdtree_knn_classification_prediction ) from onedal import _backend +from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted, _is_classifier, _is_regressor -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import ( - _check_X_y, _check_array, - _column_or_1d, - _check_n_features, _check_classification_targets, - _num_samples + _check_n_features, + _check_X_y, + _column_or_1d, + _num_samples, ) @@ -52,21 +48,23 @@ def _get_policy(self, queue, *data): def _parse_auto_method(self, method, n_samples, n_features): result_method = method - if (method in ['auto', 'ball_tree']): - condition = self.n_neighbors is not None and \ - self.n_neighbors >= n_samples // 2 - if self.metric == 'precomputed' or n_features > 15 or condition: - result_method = 'brute' + if method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None and self.n_neighbors >= n_samples // 2 + ) + if self.metric == "precomputed" or n_features > 15 or condition: + result_method = "brute" else: - if self.metric == 'euclidean': - result_method = 'kd_tree' + if self.metric == "euclidean": + result_method = "kd_tree" else: - result_method = 'brute' + result_method = "brute" return result_method - def _validate_data(self, X, y=None, reset=True, - validate_separately=False, **check_params): + def _validate_data( + self, X, y=None, reset=True, validate_separately=False, **check_params + ): if y is None: if self.requires_y: raise ValueError( @@ -88,7 +86,7 @@ def _validate_data(self, X, y=None, reset=True, X, y = _check_X_y(X, y, **check_params) out = X, y - if check_params.get('ensure_2d', True): + if check_params.get("ensure_2d", True): _check_n_features(self, X, reset=reset) return out @@ -126,42 +124,48 @@ def _get_weights(self, dist, weights): def _get_onedal_params(self, X, y=None): class_count = 0 if self.classes_ is None else len(self.classes_) - weights = getattr(self, 'weights', 'uniform') + weights = getattr(self, "weights", "uniform") return { - 'fptype': 'float' if X.dtype == np.float32 else 'double', - 'vote_weights': 'uniform' if weights == 'uniform' else 'distance', - 'method': self._fit_method, - 'radius': self.radius, - 'class_count': class_count, - 'neighbor_count': self.n_neighbors, - 'metric': self.effective_metric_, - 'p': self.p, - 'metric_params': self.effective_metric_params_, - 'result_option': 'indices|distances' if y is None else 'responses', + "fptype": "float" if X.dtype == np.float32 else "double", + "vote_weights": "uniform" if weights == "uniform" else "distance", + "method": self._fit_method, + "radius": self.radius, + "class_count": class_count, + "neighbor_count": self.n_neighbors, + "metric": self.effective_metric_, + "p": self.p, + "metric_params": self.effective_metric_params_, + "result_option": "indices|distances" if y is None else "responses", } def 
_get_daal_params(self, data): class_count = 0 if self.classes_ is None else len(self.classes_) - weights = getattr(self, 'weights', 'uniform') + weights = getattr(self, "weights", "uniform") params = { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': 'defaultDense', - 'k': self.n_neighbors, - 'voteWeights': 'voteUniform' if weights == 'uniform' else 'voteDistance', - 'resultsToCompute': 'computeIndicesOfNeighbors|computeDistances', - 'resultsToEvaluate': 'none' - if getattr(self, '_y', None) is None or _is_regressor(self) - else 'computeClassLabels' + "fptype": "float" if data.dtype == np.float32 else "double", + "method": "defaultDense", + "k": self.n_neighbors, + "voteWeights": "voteUniform" if weights == "uniform" else "voteDistance", + "resultsToCompute": "computeIndicesOfNeighbors|computeDistances", + "resultsToEvaluate": "none" + if getattr(self, "_y", None) is None or _is_regressor(self) + else "computeClassLabels", } if class_count != 0: - params['nClasses'] = class_count + params["nClasses"] = class_count return params class NeighborsBase(NeighborsCommonBase, metaclass=ABCMeta): - def __init__(self, n_neighbors=None, radius=None, - algorithm='auto', metric='minkowski', - p=2, metric_params=None): + def __init__( + self, + n_neighbors=None, + radius=None, + algorithm="auto", + metric="minkowski", + p=2, + metric_params=None, + ): self.n_neighbors = n_neighbors self.radius = radius self.algorithm = algorithm @@ -181,19 +185,21 @@ def _validate_n_classes(self): if len(self.classes_) < 2: raise ValueError( "The number of classes has to be greater than one; got %d" - " class" % len(self.classes_)) + " class" % len(self.classes_) + ) def _fit(self, X, y, queue): self._onedal_model = None self._tree = None self._shape = None self.classes_ = None - self.effective_metric_ = getattr(self, 'effective_metric_', self.metric) + self.effective_metric_ = getattr(self, "effective_metric_", self.metric) self.effective_metric_params_ = getattr( - self, 'effective_metric_params_', self.metric_params) + self, "effective_metric_params_", self.metric_params + ) if y is not None or self.requires_y: - shape = getattr(y, 'shape', None) + shape = getattr(y, "shape", None) X, y = super()._validate_data(X, y, dtype=[np.float64, np.float32]) self._shape = shape if shape is not None else y.shape @@ -208,8 +214,7 @@ def _fit(self, X, y, queue): self.classes_ = [] self._y = np.empty(y.shape, dtype=int) for k in range(self._y.shape[1]): - classes, self._y[:, k] = np.unique( - y[:, k], return_inverse=True) + classes, self._y[:, k] = np.unique(y[:, k], return_inverse=True) self.classes_.append(classes) if not self.outputs_2d_: @@ -228,19 +233,16 @@ def _fit(self, X, y, queue): if self.n_neighbors is not None: if self.n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - self.n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. 
Got %d" % self.n_neighbors) if not isinstance(self.n_neighbors, Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(self.n_neighbors)) + "enter integer value" % type(self.n_neighbors) + ) self._fit_method = super()._parse_auto_method( - self.algorithm, - self.n_samples_fit_, self.n_features_in_) + self.algorithm, self.n_samples_fit_, self.n_features_in_ + ) _fit_y = None gpu_device = queue is not None and queue.sycl_device.is_gpu @@ -257,34 +259,34 @@ def _fit(self, X, y, queue): return result - def _kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): - n_features = getattr(self, 'n_features_in_', None) - shape = getattr(X, 'shape', None) + def _kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): + n_features = getattr(self, "n_features_in_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but kneighbors is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but kneighbors is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) if n_neighbors is None: n_neighbors = self.n_neighbors elif n_neighbors <= 0: - raise ValueError( - "Expected n_neighbors > 0. Got %d" % - n_neighbors - ) + raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors) else: if not isinstance(n_neighbors, Integral): raise TypeError( "n_neighbors does not take %s value, " - "enter integer value" % - type(n_neighbors)) + "enter integer value" % type(n_neighbors) + ) if X is not None: query_is_train = False - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) else: query_is_train = True X = self._fit_X @@ -297,31 +299,32 @@ def _kneighbors(self, X=None, n_neighbors=None, if n_neighbors > n_samples_fit: raise ValueError( "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (n_samples_fit, n_neighbors) + " but n_samples = %d, n_neighbors = %d" % (n_samples_fit, n_neighbors) ) chunked_results = None method = super()._parse_auto_method( - self._fit_method, self.n_samples_fit_, n_features) + self._fit_method, self.n_samples_fit_, n_features + ) gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = super()._get_daal_params(X) else: params = super()._get_onedal_params(X) prediction_results = self._onedal_predict( - self._onedal_model, X, params, queue=queue) + self._onedal_model, X, params, queue=queue + ) - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: distances = prediction_results.distances indices = prediction_results.indices else: distances = from_table(prediction_results.distances) indices = from_table(prediction_results.indices) - if method == 'kd_tree': + if method == "kd_tree": for i in range(distances.shape[0]): seq = distances[i].argsort() indices[i] = indices[i][seq] @@ -363,26 +366,34 @@ def _kneighbors(self, X=None, n_neighbors=None, dup_gr_nbrs = np.all(sample_mask, axis=1) sample_mask[:, 0][dup_gr_nbrs] = False - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_queries, n_neighbors - 1)) + neigh_ind = np.reshape(neigh_ind[sample_mask], (n_queries, 
n_neighbors - 1)) if return_distance: - neigh_dist = np.reshape( - neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) + neigh_dist = np.reshape(neigh_dist[sample_mask], (n_queries, n_neighbors - 1)) return neigh_dist, neigh_ind return neigh_ind class KNeighborsClassifier(NeighborsBase, ClassifierMixin): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -391,15 +402,15 @@ def _get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToEvaluate'] = 'computeClassLabels' - params['resultsToCompute'] = '' + params["resultsToEvaluate"] = "computeClassLabels" + params["resultsToCompute"] = "" return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -410,15 +421,16 @@ def _onedal_fit(self, X, y, queue): policy = self._get_policy(queue, X, y) X, y = _convert_to_supported(policy, X, y) params = self._get_onedal_params(X, y) - train_alg = _backend.neighbors.classification.train(policy, params, - *to_table(X, y)) + train_alg = _backend.neighbors.classification.train( + policy, params, *to_table(X, y) + ) return train_alg.model def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -428,15 +440,16 @@ def _onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(_backend.neighbors.classification) - if 'responses' not in params['result_option']: - params['result_option'] += '|responses' - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + if "responses" not in params["result_option"]: + params["result_option"] += "|responses" + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = _backend.neighbors.classification.infer( - policy, params, model, to_table(X)) + policy, params, model, to_table(X) + ) return result @@ -444,37 +457,40 @@ def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) def predict(self, X, queue=None): - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - onedal_model = getattr(self, '_onedal_model', None) - n_features = getattr(self, 'n_features_in_', None) - n_samples_fit_ = getattr(self, 'n_samples_fit_', None) - shape = getattr(X, 'shape', None) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = 
getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) self._fit_method = super()._parse_auto_method( - self.algorithm, - n_samples_fit_, n_features) + self.algorithm, n_samples_fit_, n_features + ) self._validate_n_classes() gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) else: params = self._get_onedal_params(X) prediction_result = self._onedal_predict(onedal_model, X, params, queue=queue) - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: responses = prediction_result.prediction else: responses = from_table(prediction_result.responses) - result = self.classes_.take( - np.asarray(responses.ravel(), dtype=np.intp)) + result = self.classes_.take(np.asarray(responses.ravel(), dtype=np.intp)) return result @@ -515,21 +531,30 @@ def predict_proba(self, X, queue=None): return probabilities - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) class KNeighborsRegressor(NeighborsBase, RegressorMixin): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -538,15 +563,15 @@ def _get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToCompute'] = 'computeIndicesOfNeighbors|computeDistances' - params['resultsToEvaluate'] = 'none' + params["resultsToCompute"] = "computeIndicesOfNeighbors|computeDistances" + params["resultsToEvaluate"] = "none" return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -566,8 +591,8 @@ def _onedal_fit(self, X, y, queue): def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -577,16 +602,17 @@ def 
_onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - backend = _backend.neighbors.regression if gpu_device \ - else _backend.neighbors.search + backend = ( + _backend.neighbors.regression if gpu_device else _backend.neighbors.search + ) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(backend) - if 'responses' not in params['result_option'] and gpu_device: - params['result_option'] += '|responses' - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + if "responses" not in params["result_option"] and gpu_device: + params["result_option"] += "|responses" + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = backend.infer(policy, params, model, to_table(X)) return result @@ -594,26 +620,29 @@ def _onedal_predict(self, model, X, params, queue): def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) def _predict_gpu(self, X, queue=None): - X = _check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) - onedal_model = getattr(self, '_onedal_model', None) - n_features = getattr(self, 'n_features_in_', None) - n_samples_fit_ = getattr(self, 'n_samples_fit_', None) - shape = getattr(X, 'shape', None) + X = _check_array(X, accept_sparse="csr", dtype=[np.float64, np.float32]) + onedal_model = getattr(self, "_onedal_model", None) + n_features = getattr(self, "n_features_in_", None) + n_samples_fit_ = getattr(self, "n_samples_fit_", None) + shape = getattr(X, "shape", None) if n_features and shape and len(shape) > 1 and shape[1] != n_features: - raise ValueError((f'X has {X.shape[1]} features, ' - f'but KNNClassifier is expecting ' - f'{n_features} features as input')) + raise ValueError( + ( + f"X has {X.shape[1]} features, " + f"but KNNClassifier is expecting " + f"{n_features} features as input" + ) + ) _check_is_fitted(self) self._fit_method = super()._parse_auto_method( - self.algorithm, - n_samples_fit_, n_features) + self.algorithm, n_samples_fit_, n_features + ) params = self._get_onedal_params(X) @@ -649,21 +678,34 @@ def _predict_skl(self, X, queue=None): def predict(self, X, queue=None): gpu_device = queue is not None and queue.sycl_device.is_gpu - is_uniform_weights = getattr(self, 'weights', 'uniform') == 'uniform' - return self._predict_gpu(X, queue=queue) \ - if gpu_device and is_uniform_weights else self._predict_skl(X, queue=queue) + is_uniform_weights = getattr(self, "weights", "uniform") == "uniform" + return ( + self._predict_gpu(X, queue=queue) + if gpu_device and is_uniform_weights + else self._predict_skl(X, queue=queue) + ) class NearestNeighbors(NeighborsBase): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', - p=2, metric='minkowski', metric_params=None, **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + p=2, + metric="minkowski", + metric_params=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - metric=metric, p=p, + metric=metric, + p=p, metric_params=metric_params, - **kwargs) + **kwargs, + ) self.weights = weights def _get_onedal_params(self, X, y=None): @@ -672,16 +714,17 @@ def 
_get_onedal_params(self, X, y=None): def _get_daal_params(self, data): params = super()._get_daal_params(data) - params['resultsToCompute'] = 'computeIndicesOfNeighbors|computeDistances' - params['resultsToEvaluate'] = 'none' if getattr(self, '_y', None) is None \ - else 'computeClassLabels' + params["resultsToCompute"] = "computeIndicesOfNeighbors|computeDistances" + params["resultsToEvaluate"] = ( + "none" if getattr(self, "_y", None) is None else "computeClassLabels" + ) return params def _onedal_fit(self, X, y, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: + if self.effective_metric_ == "euclidean" and not gpu_device: params = self._get_daal_params(X) - if self._fit_method == 'brute': + if self._fit_method == "brute": train_alg = bf_knn_classification_training else: @@ -692,15 +735,14 @@ def _onedal_fit(self, X, y, queue): policy = self._get_policy(queue, X, y) X, y = _convert_to_supported(policy, X, y) params = self._get_onedal_params(X, y) - train_alg = _backend.neighbors.search.train(policy, params, - to_table(X)) + train_alg = _backend.neighbors.search.train(policy, params, to_table(X)) return train_alg.model def _onedal_predict(self, model, X, params, queue): gpu_device = queue is not None and queue.sycl_device.is_gpu - if self.effective_metric_ == 'euclidean' and not gpu_device: - if self._fit_method == 'brute': + if self.effective_metric_ == "euclidean" and not gpu_device: + if self._fit_method == "brute": predict_alg = bf_knn_classification_prediction else: @@ -710,12 +752,12 @@ def _onedal_predict(self, model, X, params, queue): policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(_backend.neighbors.search) - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + params["fptype"] = "float" if X.dtype == np.float32 else "double" result = _backend.neighbors.search.infer(policy, params, model, to_table(X)) return result @@ -723,6 +765,5 @@ def _onedal_predict(self, model, X, params, queue): def fit(self, X, y, queue=None): return super()._fit(X, y, queue=queue) - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super()._kneighbors(X, n_neighbors, return_distance, queue=queue) diff --git a/onedal/neighbors/tests/test_knn_classification.py b/onedal/neighbors/tests/test_knn_classification.py index c44e658e9d..8941f49965 100755 --- a/onedal/neighbors/tests/test_knn_classification.py +++ b/onedal/neighbors/tests/test_knn_classification.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_array_equal +from sklearn import datasets from onedal.neighbors import KNeighborsClassifier from onedal.tests.utils._device_selection import get_queues -from sklearn import datasets - -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) @@ -33,13 +32,14 @@ def test_iris(queue): # TODO: investigate failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_pickle(queue): iris = datasets.load_iris() clf = KNeighborsClassifier(2).fit(iris.data, iris.target, queue=queue) expected = clf.predict(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 3b1a72bffc..a409999030 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,13 +14,14 @@ # limitations under the License. # =============================================================================== -from .kernel_functions import linear_kernel, rbf_kernel, poly_kernel, sigmoid_kernel from .get_tree import get_tree_state_cls, get_tree_state_reg +from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ - 'get_tree_state_cls', - 'get_tree_state_reg', - 'linear_kernel', - 'rbf_kernel', - 'poly_kernel', - 'sigmoid_kernel'] + "get_tree_state_cls", + "get_tree_state_reg", + "linear_kernel", + "rbf_kernel", + "poly_kernel", + "sigmoid_kernel", +] diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 2ba33e1e61..9afd86624b 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -18,8 +18,7 @@ def get_tree_state_cls(model, iTree, n_classes): - return _backend.get_tree.classification.get_tree_state( - model, iTree, n_classes) + return _backend.get_tree.classification.get_tree_state(model, iTree, n_classes) def get_tree_state_reg(model, iTree): diff --git a/onedal/primitives/kernel_functions.py b/onedal/primitives/kernel_functions.py index 5306d45684..aaa2eb3380 100644 --- a/onedal/primitives/kernel_functions.py +++ b/onedal/primitives/kernel_functions.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np + from onedal import _backend from ..common._policy import _get_policy -from ..datatypes import from_table, to_table, _convert_to_supported +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import _check_array def _check_inputs(X, Y): def check_input(data): return _check_array(data, dtype=[np.float64, np.float32], force_all_finite=False) + X = check_input(X) Y = X if Y is None else check_input(Y) return X, Y @@ -33,7 +35,7 @@ def check_input(data): def _compute_kernel(params, submodule, X, Y, queue): policy = _get_policy(queue, X, Y) X, Y = _convert_to_supported(policy, X, Y) - params['fptype'] = 'float' if X.dtype == np.float32 else 'double' + params["fptype"] = "float" if X.dtype == np.float32 else "double" X, Y = to_table(X, Y) result = submodule.compute(policy, params, X, Y) return from_table(result.values) @@ -57,9 +59,13 @@ def linear_kernel(X, Y=None, scale=1.0, shift=0.0, queue=None): kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': scale, 'shift': shift}, - _backend.linear_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": scale, "shift": shift}, + _backend.linear_kernel, + X, + Y, + queue, + ) def rbf_kernel(X, Y=None, gamma=None, queue=None): @@ -85,8 +91,9 @@ def rbf_kernel(X, Y=None, gamma=None, queue=None): gamma = 1.0 / X.shape[1] if gamma is None else gamma sigma = np.sqrt(0.5 / gamma) - return _compute_kernel({'method': 'dense', 'sigma': sigma}, - _backend.rbf_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "sigma": sigma}, _backend.rbf_kernel, X, Y, queue + ) def poly_kernel(X, Y=None, gamma=1.0, coef0=0.0, degree=3, queue=None): @@ -109,9 +116,13 @@ def poly_kernel(X, Y=None, gamma=1.0, coef0=0.0, degree=3, queue=None): """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': gamma, 'shift': coef0, 'degree': degree}, - _backend.polynomial_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": gamma, "shift": coef0, "degree": degree}, + _backend.polynomial_kernel, + X, + Y, + queue, + ) def sigmoid_kernel(X, Y=None, gamma=1.0, coef0=0.0, queue=None): @@ -133,6 +144,10 @@ def sigmoid_kernel(X, Y=None, gamma=1.0, coef0=0.0, queue=None): """ X, Y = _check_inputs(X, Y) - return _compute_kernel({'method': 'dense', - 'scale': gamma, 'shift': coef0}, - _backend.sigmoid_kernel, X, Y, queue) + return _compute_kernel( + {"method": "dense", "scale": gamma, "shift": coef0}, + _backend.sigmoid_kernel, + X, + Y, + queue, + ) diff --git a/onedal/primitives/tests/test_kernel_functions.py b/onedal/primitives/tests/test_kernel_functions.py index e5682605c8..d8589a8e07 100644 --- a/onedal/primitives/tests/test_kernel_functions.py +++ b/onedal/primitives/tests/test_kernel_functions.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,21 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np +import pytest from numpy.testing import assert_allclose -from onedal.primitives import (linear_kernel, rbf_kernel, - poly_kernel, sigmoid_kernel) from sklearn.metrics.pairwise import rbf_kernel as sklearn_rbf_kernel -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.primitives import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_dense_self_linear_kernel(queue): rng = np.random.RandomState(0) X = np.array(5 * rng.random_sample((10, 4))) @@ -49,15 +50,15 @@ def _test_dense_small_linear_kernel(queue, scale, shift, dtype): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('scale', [1.0, 2.0]) -@pytest.mark.parametrize('shift', [0.0, 1.0]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("scale", [1.0, 2.0]) +@pytest.mark.parametrize("shift", [0.0, 1.0]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_linear_kernel(queue, scale, shift, dtype): _test_dense_small_linear_kernel(queue, scale, shift, dtype) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_rbf_kernel(queue): rng = np.random.RandomState(0) X = np.array(5 * rng.random_sample((10, 4))) @@ -80,15 +81,15 @@ def _test_dense_small_rbf_kernel(queue, gamma, dtype): assert_allclose(result, expected, rtol=tol) -@pytest.mark.parametrize('gamma', [0.1, None]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("gamma", [0.1, None]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_small_rbf_kernel(queue, gamma, dtype): _test_dense_small_rbf_kernel(queue, gamma, dtype) @pass_if_not_implemented_for_gpu(reason="poly kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_poly_kernel(queue): rng = np.random.RandomState(0) X = np.array(2 * rng.random_sample((10, 4))) @@ -113,17 +114,17 @@ def _test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype): @pass_if_not_implemented_for_gpu(reason="poly kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', [0.1, 1.0]) -@pytest.mark.parametrize('coef0', [0.0, 1.0]) -@pytest.mark.parametrize('degree', [2, 3]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", [0.1, 1.0]) +@pytest.mark.parametrize("coef0", [0.0, 1.0]) +@pytest.mark.parametrize("degree", [2, 3]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype): _test_dense_small_poly_kernel(queue, gamma, coef0, degree, dtype) @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not 
implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_dense_self_sigmoid_kernel(queue): rng = np.random.RandomState(0) X = np.array(2 * rng.random_sample((15, 4))) @@ -147,9 +148,9 @@ def _test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype): @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', [0.1, 1.0, 2.4]) -@pytest.mark.parametrize('coef0', [0.0, 1.0, 5.5]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", [0.1, 1.0, 2.4]) +@pytest.mark.parametrize("coef0", [0.0, 1.0, 5.5]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype): _test_dense_small_sigmoid_kernel(queue, gamma, coef0, dtype) diff --git a/onedal/spmd/__init__.py b/onedal/spmd/__init__.py index 9099df571a..3c698d694b 100644 --- a/onedal/spmd/__init__.py +++ b/onedal/spmd/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== __all__ = [ - 'basic_statistics', - 'cluster', - 'decomposition', - 'ensemble', - 'linear_model', - 'neighbors'] + "basic_statistics", + "cluster", + "decomposition", + "ensemble", + "linear_model", + "neighbors", +] diff --git a/onedal/spmd/basic_statistics/__init__.py b/onedal/spmd/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/onedal/spmd/basic_statistics/__init__.py +++ b/onedal/spmd/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/onedal/spmd/basic_statistics/basic_statistics.py b/onedal/spmd/basic_statistics/basic_statistics.py index af4a5e2429..86269277d9 100644 --- a/onedal/spmd/basic_statistics/basic_statistics.py +++ b/onedal/spmd/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.basic_statistics import BasicStatistics as BasicStatistics_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseBasicStatisticsSPMD(ABC): def _get_policy(self, queue, *data): @@ -26,7 +28,6 @@ def _get_policy(self, queue, *data): class BasicStatistics(BaseBasicStatisticsSPMD, BasicStatistics_Batch): - @support_usm_ndarray() def compute(self, data, weights=None, queue=None): return super().compute(data, weights, queue) diff --git a/onedal/spmd/cluster/__init__.py b/onedal/spmd/cluster/__init__.py index b94f1d3918..0c39935dc2 100644 --- a/onedal/spmd/cluster/__init__.py +++ b/onedal/spmd/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans import KMeans - __all__ = ['KMeans'] + __all__ = ["KMeans"] else: __all__ = [] diff --git a/onedal/spmd/cluster/kmeans.py b/onedal/spmd/cluster/kmeans.py index 000d265af1..abab681554 100644 --- a/onedal/spmd/cluster/kmeans.py +++ b/onedal/spmd/cluster/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.cluster import KMeans as KMeans_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseKMeansSPMD(ABC): def _get_policy(self, queue, *data): diff --git a/onedal/spmd/decomposition/__init__.py b/onedal/spmd/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/onedal/spmd/decomposition/__init__.py +++ b/onedal/spmd/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/onedal/spmd/decomposition/pca.py b/onedal/spmd/decomposition/pca.py index a511170ec2..e150cf8e63 100644 --- a/onedal/spmd/decomposition/pca.py +++ b/onedal/spmd/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray from onedal.decomposition.pca import PCA as PCABatch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BasePCASPMD: def _get_policy(self, queue, *data): @@ -26,7 +27,6 @@ def _get_policy(self, queue, *data): class PCA(BasePCASPMD, PCABatch): - @support_usm_ndarray() def fit(self, X, queue): return super().fit(X, queue) diff --git a/onedal/spmd/ensemble/__init__.py b/onedal/spmd/ensemble/__init__.py index 5dcc919355..b53fb8f910 100644 --- a/onedal/spmd/ensemble/__init__.py +++ b/onedal/spmd/ensemble/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/onedal/spmd/ensemble/forest.py b/onedal/spmd/ensemble/forest.py index d2e32be87c..56d18a2a0f 100644 --- a/onedal/spmd/ensemble/forest.py +++ b/onedal/spmd/ensemble/forest.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,15 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy - from onedal.ensemble import RandomForestClassifier as RandomForestClassifier_Batch from onedal.ensemble import RandomForestRegressor as RandomForestRegressor_Batch +from ...common._spmd_policy import _get_spmd_policy + class BaseForestSPMD(ABC): def _get_policy(self, queue, *data): diff --git a/onedal/spmd/linear_model/__init__.py b/onedal/spmd/linear_model/__init__.py index 33e882bdcb..893243cd56 100644 --- a/onedal/spmd/linear_model/__init__.py +++ b/onedal/spmd/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/onedal/spmd/linear_model/linear_model.py b/onedal/spmd/linear_model/linear_model.py index d07eb7df28..8990a3b1c9 100644 --- a/onedal/spmd/linear_model/linear_model.py +++ b/onedal/spmd/linear_model/linear_model.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.linear_model import LinearRegression as LinearRegression_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class BaseLinearRegressionSPMD(ABC): def _get_policy(self, queue, *data): @@ -26,7 +28,6 @@ def _get_policy(self, queue, *data): class LinearRegression(BaseLinearRegressionSPMD, LinearRegression_Batch): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) diff --git a/onedal/spmd/neighbors/__init__.py b/onedal/spmd/neighbors/__init__.py index 99099fa51c..11f104287a 100644 --- a/onedal/spmd/neighbors/__init__.py +++ b/onedal/spmd/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/onedal/spmd/neighbors/neighbors.py b/onedal/spmd/neighbors/neighbors.py index 02981599b9..d3b7cb61c8 100644 --- a/onedal/spmd/neighbors/neighbors.py +++ b/onedal/spmd/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -from ...common._spmd_policy import _get_spmd_policy -from ..._device_offload import support_usm_ndarray + from onedal.neighbors import KNeighborsClassifier as KNeighborsClassifier_Batch from onedal.neighbors import KNeighborsRegressor as KNeighborsRegressor_Batch +from ..._device_offload import support_usm_ndarray +from ...common._spmd_policy import _get_spmd_policy + class NeighborsCommonBaseSPMD(ABC): def _get_policy(self, queue, *data): @@ -27,7 +29,6 @@ def _get_policy(self, queue, *data): class KNeighborsClassifier(NeighborsCommonBaseSPMD, KNeighborsClassifier_Batch): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) @@ -41,8 +42,7 @@ def predict_proba(self, X, queue=None): raise NotImplementedError("predict_proba not supported in distributed mode.") @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) @@ -52,12 +52,13 @@ def fit(self, X, y, queue=None): if queue is not None and queue.sycl_device.is_gpu: return super()._fit(X, y, queue=queue) else: - raise ValueError('SPMD version of kNN is not implemented for ' - 'CPU. Consider running on it on GPU.') + raise ValueError( + "SPMD version of kNN is not implemented for " + "CPU. Consider running on it on GPU." + ) @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) @support_usm_ndarray() @@ -66,18 +67,16 @@ def predict(self, X, queue=None): def _get_onedal_params(self, X, y=None): params = super()._get_onedal_params(X, y) - if 'responses' not in params['result_option']: - params['result_option'] += '|responses' + if "responses" not in params["result_option"]: + params["result_option"] += "|responses" return params class NearestNeighbors(NeighborsCommonBaseSPMD): - @support_usm_ndarray() def fit(self, X, y, queue=None): return super().fit(X, y, queue) @support_usm_ndarray() - def kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, queue=None): return super().kneighbors(X, n_neighbors, return_distance, queue) diff --git a/onedal/svm/__init__.py b/onedal/svm/__init__.py index c8647cba2a..941048029b 100644 --- a/onedal/svm/__init__.py +++ b/onedal/svm/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .svm import SVC, SVR, NuSVC, NuSVR, SVMtype -__all__ = ['SVC', 'SVR', 'NuSVC', 'NuSVR', 'SVMtype'] +__all__ = ["SVC", "SVR", "NuSVC", "NuSVR", "SVMtype"] diff --git a/onedal/svm/svm.py b/onedal/svm/svm.py index 6997d41ec7..b851d0178c 100644 --- a/onedal/svm/svm.py +++ b/onedal/svm/svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version -from sklearn.base import BaseEstimator from abc import ABCMeta, abstractmethod from enum import Enum from numbers import Number, Real import numpy as np from scipy import sparse as sp +from sklearn.base import BaseEstimator + +from daal4py.sklearn._utils import sklearn_check_version +from onedal import _backend +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes import ( - from_table, - to_table) +from ..datatypes import from_table, to_table from ..utils import ( - _validate_targets, - _check_X_y, _check_array, + _check_n_features, + _check_X_y, _column_or_1d, - _check_n_features + _validate_targets, ) -from onedal import _backend class SVMtype(Enum): @@ -48,11 +47,28 @@ class SVMtype(Enum): class BaseSVM(BaseEstimator, metaclass=ABCMeta): @abstractmethod - def __init__(self, C, nu, epsilon, kernel='rbf', *, degree, gamma, - coef0, tol, shrinking, cache_size, max_iter, tau, - class_weight, decision_function_shape, - break_ties, algorithm, svm_type=None, **kwargs): - + def __init__( + self, + C, + nu, + epsilon, + kernel="rbf", + *, + degree, + gamma, + coef0, + tol, + shrinking, + cache_size, + max_iter, + tau, + class_weight, + decision_function_shape, + break_ties, + algorithm, + svm_type=None, + **kwargs, + ): self.C = C self.nu = nu self.epsilon = epsilon @@ -73,14 +89,14 @@ def __init__(self, C, nu, epsilon, kernel='rbf', *, degree, gamma, def _compute_gamma_sigma(self, gamma, X): if isinstance(gamma, str): - if gamma == 'scale': + if gamma == "scale": if sp.isspmatrix(X): # var = E[X^2] - E[X]^2 - X_sc = (X.multiply(X)).mean() - (X.mean())**2 + X_sc = (X.multiply(X)).mean() - (X.mean()) ** 2 else: X_sc = X.var() _gamma = 1.0 / (X.shape[1] * X_sc) if X_sc != 0 else 1.0 - elif gamma == 'auto': + elif gamma == "auto": _gamma = 1.0 / X.shape[1] else: raise ValueError( @@ -88,7 +104,7 @@ def _compute_gamma_sigma(self, gamma, X): "'auto'. 
Got '{}' instead.".format(gamma) ) else: - if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): if isinstance(gamma, Real): if gamma <= 0: msg = ( @@ -119,17 +135,19 @@ def _get_sample_weight(self, X, y, sample_weight): if n_samples == 1: raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=np.float64) + sample_weight = np.asarray( + [] if sample_weight is None else sample_weight, dtype=np.float64 + ) sample_weight_count = sample_weight.shape[0] if sample_weight_count != 0 and sample_weight_count != n_samples: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (len(sample_weight), X.shape) + ) ww = None if sample_weight_count == 0 and self.class_weight_ is None: @@ -141,40 +159,51 @@ def _get_sample_weight(self, X, y, sample_weight): sample_weight = np.full(n_samples, sample_weight, dtype=dtype) else: sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + sample_weight, + accept_sparse=False, + ensure_2d=False, + dtype=dtype, + order="C", ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) if self.svm_type == SVMtype.nu_svc: - weight_per_class = [np.sum(sample_weight[y == class_label]) - for class_label in np.unique(y)] + weight_per_class = [ + np.sum(sample_weight[y == class_label]) for class_label in np.unique(y) + ] for i in range(len(weight_per_class)): for j in range(i + 1, len(weight_per_class)): - if self.nu * (weight_per_class[i] + weight_per_class[j]) / 2 > \ - min(weight_per_class[i], weight_per_class[j]): - raise ValueError('specified nu is infeasible') + if self.nu * (weight_per_class[i] + weight_per_class[j]) / 2 > min( + weight_per_class[i], weight_per_class[j] + ): + raise ValueError("specified nu is infeasible") if np.all(sample_weight <= 0): if self.svm_type == SVMtype.nu_svc: - err_msg = 'negative dimensions are not allowed' + err_msg = "negative dimensions are not allowed" else: - err_msg = 'Invalid input - all samples have zero or negative weights.' + err_msg = "Invalid input - all samples have zero or negative weights." raise ValueError(err_msg) if np.any(sample_weight <= 0): - if self.svm_type == SVMtype.c_svc and \ - len(np.unique(y[sample_weight > 0])) != len(self.classes_): + if self.svm_type == SVMtype.c_svc and len( + np.unique(y[sample_weight > 0]) + ) != len(self.classes_): raise ValueError( - 'Invalid input - all samples with positive weights ' - 'belong to the same class' if sklearn_check_version('1.2') else - 'Invalid input - all samples with positive weights ' - 'have the same label.') + "Invalid input - all samples with positive weights " + "belong to the same class" + if sklearn_check_version("1.2") + else "Invalid input - all samples with positive weights " + "have the same label." 
+ ) ww = sample_weight if self.class_weight_ is not None: for i, v in enumerate(self.class_weight_): @@ -192,39 +221,51 @@ def _get_onedal_params(self, data): self.n_iter_ = 1 if max_iter < 1 else max_iter class_count = 0 if self.classes_ is None else len(self.classes_) return { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': self.algorithm, - 'kernel': self.kernel, - 'c': self.C, 'nu': self.nu, 'epsilon': self.epsilon, - 'class_count': class_count, 'accuracy_threshold': self.tol, - 'max_iteration_count': int(max_iter), 'scale': self._scale_, - 'sigma': self._sigma_, 'shift': self.coef0, 'degree': self.degree, - 'tau': self.tau, 'shrinking': self.shrinking, 'cache_size': self.cache_size + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.algorithm, + "kernel": self.kernel, + "c": self.C, + "nu": self.nu, + "epsilon": self.epsilon, + "class_count": class_count, + "accuracy_threshold": self.tol, + "max_iteration_count": int(max_iter), + "scale": self._scale_, + "sigma": self._sigma_, + "shift": self.coef0, + "degree": self.degree, + "tau": self.tau, + "shrinking": self.shrinking, + "cache_size": self.cache_size, } def _fit(self, X, y, sample_weight, module, queue): - if hasattr(self, 'decision_function_shape'): - if self.decision_function_shape not in ('ovr', 'ovo', None): + if hasattr(self, "decision_function_shape"): + if self.decision_function_shape not in ("ovr", "ovo", None): raise ValueError( f"decision_function_shape must be either 'ovr' or 'ovo', " f"got {self.decision_function_shape}." ) if y is None: - if self._get_tags()['requires_y']: + if self._get_tags()["requires_y"]: raise ValueError( f"This {self.__class__.__name__} estimator " f"requires y to be passed, but the target y is None." 
) X, y = _check_X_y( - X, y, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X, + y, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) y = self._validate_targets(y, X.dtype) sample_weight = self._get_sample_weight(X, y, sample_weight) self._sparse = sp.isspmatrix(X) - if self.kernel == 'linear': + if self.kernel == "linear": self._scale_, self._sigma_ = 1.0, 1.0 self.coef0 = 0.0 else: @@ -242,14 +283,15 @@ def _fit(self, X, y, sample_weight, module, queue): self.support_vectors_ = from_table(result.support_vectors) self.intercept_ = from_table(result.biases).ravel() - self.support_ = from_table(result.support_indices).ravel().astype('int') + self.support_ = from_table(result.support_indices).ravel().astype("int") self.n_features_in_ = X.shape[1] self.shape_fit_ = X.shape - if getattr(self, 'classes_', None) is not None: + if getattr(self, "classes_", None) is not None: indices = y.take(self.support_, axis=0) - self._n_support = np.array([ - np.sum(indices == i) for i, _ in enumerate(self.classes_)]) + self._n_support = np.array( + [np.sum(indices == i) for i, _ in enumerate(self.classes_)] + ) self._gamma = self._scale_ self._onedal_model = result.model @@ -268,22 +310,32 @@ def _create_model(self, module): def _predict(self, X, module, queue): _check_is_fitted(self) - if self.break_ties and self.decision_function_shape == 'ovo': - raise ValueError("break_ties must be False when " - "decision_function_shape is 'ovo'") + if self.break_ties and self.decision_function_shape == "ovo": + raise ValueError( + "break_ties must be False when " "decision_function_shape is 'ovo'" + ) - if (module in [_backend.svm.classification, _backend.svm.nu_classification]): + if module in [_backend.svm.classification, _backend.svm.nu_classification]: sv = self.support_vectors_ if not self._sparse and sv.size > 0 and self._n_support.sum() != sv.shape[0]: - raise ValueError("The internal representation " - f"of {self.__class__.__name__} was altered") + raise ValueError( + "The internal representation " + f"of {self.__class__.__name__} was altered" + ) - if self.break_ties and self.decision_function_shape == 'ovr' and \ - len(self.classes_) > 2: + if ( + self.break_ties + and self.decision_function_shape == "ovr" + and len(self.classes_) > 2 + ): y = np.argmax(self.decision_function(X), axis=1) else: - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X = _check_array( + X, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) _check_n_features(self, X, False) if self._sparse and not sp.isspmatrix(X): @@ -294,12 +346,13 @@ def _predict(self, X, module, queue): if sp.issparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" - % type(self).__name__) + % type(self).__name__ + ) policy = _get_policy(queue, X) params = self._get_onedal_params(X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module) @@ -321,14 +374,16 @@ def _ovr_decision_function(self, predictions, confidences, n_classes): votes[predictions[:, k] == 1, j] += 1 k += 1 - transformed_confidences = \ - sum_of_confidences / (3 * (np.abs(sum_of_confidences) + 1)) + transformed_confidences = sum_of_confidences / ( + 3 * (np.abs(sum_of_confidences) + 1) + ) return votes + transformed_confidences def _decision_function(self, X, module, queue): 
_check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=False, accept_sparse='csr') + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=False, accept_sparse="csr" + ) _check_n_features(self, X, False) if self._sparse and not sp.isspmatrix(X): @@ -339,18 +394,21 @@ def _decision_function(self, X, module, queue): if sp.issparse(X) and not self._sparse and not callable(self.kernel): raise ValueError( "cannot use sparse input in %r trained on dense data" - % type(self).__name__) + % type(self).__name__ + ) - if (module in [_backend.svm.classification, _backend.svm.nu_classification]): + if module in [_backend.svm.classification, _backend.svm.nu_classification]: sv = self.support_vectors_ if not self._sparse and sv.size > 0 and self._n_support.sum() != sv.shape[0]: - raise ValueError("The internal representation " - f"of {self.__class__.__name__} was altered") + raise ValueError( + "The internal representation " + f"of {self.__class__.__name__} was altered" + ) policy = _get_policy(queue, X) params = self._get_onedal_params(X) - if hasattr(self, '_onedal_model'): + if hasattr(self, "_onedal_model"): model = self._onedal_model else: model = self._create_model(module) @@ -360,9 +418,10 @@ def _decision_function(self, X, module, queue): if len(self.classes_) == 2: decision_function = decision_function.ravel() - if self.decision_function_shape == 'ovr' and len(self.classes_) > 2: + if self.decision_function_shape == "ovr" and len(self.classes_) > 2: decision_function = self._ovr_decision_function( - decision_function < 0, -decision_function, len(self.classes_)) + decision_function < 0, -decision_function, len(self.classes_) + ) return decision_function @@ -371,17 +430,41 @@ class SVR(RegressorMixin, BaseSVM): Epsilon--Support Vector Regression. """ - def __init__(self, C=1.0, epsilon=0.1, kernel='rbf', *, degree=3, - gamma='scale', coef0=0.0, tol=1e-3, shrinking=True, - cache_size=200.0, max_iter=-1, tau=1e-12, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=0.5, epsilon=epsilon, kernel=kernel, - degree=degree, gamma=gamma, - coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=None, - decision_function_shape=None, - break_ties=False, algorithm=algorithm) + def __init__( + self, + C=1.0, + epsilon=0.1, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=0.5, + epsilon=epsilon, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=None, + decision_function_shape=None, + break_ties=False, + algorithm=algorithm, + ) self.svm_type = SVMtype.epsilon_svr def fit(self, X, y, sample_weight=None, queue=None): @@ -397,22 +480,49 @@ class SVC(ClassifierMixin, BaseSVM): C-Support Vector Classification. 
""" - def __init__(self, C=1.0, kernel='rbf', *, degree=3, gamma='scale', - coef0=0.0, tol=1e-3, shrinking=True, cache_size=200.0, - max_iter=-1, tau=1e-12, class_weight=None, - decision_function_shape='ovr', break_ties=False, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=0.5, epsilon=0.0, kernel=kernel, degree=degree, - gamma=gamma, coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=class_weight, - decision_function_shape=decision_function_shape, - break_ties=break_ties, algorithm=algorithm) + def __init__( + self, + C=1.0, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + class_weight=None, + decision_function_shape="ovr", + break_ties=False, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=0.5, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=class_weight, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + algorithm=algorithm, + ) self.svm_type = SVMtype.c_svc def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) return y def fit(self, X, y, sample_weight=None, queue=None): @@ -433,17 +543,41 @@ class NuSVR(RegressorMixin, BaseSVM): Nu-Support Vector Regression. """ - def __init__(self, nu=0.5, C=1.0, kernel='rbf', *, degree=3, - gamma='scale', coef0=0.0, tol=1e-3, shrinking=True, - cache_size=200.0, max_iter=-1, tau=1e-12, - algorithm='thunder', **kwargs): - super().__init__(C=C, nu=nu, epsilon=0.0, kernel=kernel, - degree=degree, gamma=gamma, - coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=None, - decision_function_shape=None, - break_ties=False, algorithm=algorithm) + def __init__( + self, + nu=0.5, + C=1.0, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=C, + nu=nu, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=None, + decision_function_shape=None, + break_ties=False, + algorithm=algorithm, + ) self.svm_type = SVMtype.nu_svr def fit(self, X, y, sample_weight=None, queue=None): @@ -459,22 +593,49 @@ class NuSVC(ClassifierMixin, BaseSVM): Nu-Support Vector Classification. 
""" - def __init__(self, nu=0.5, kernel='rbf', *, degree=3, gamma='scale', - coef0=0.0, tol=1e-3, shrinking=True, cache_size=200.0, - max_iter=-1, tau=1e-12, class_weight=None, - decision_function_shape='ovr', break_ties=False, - algorithm='thunder', **kwargs): - super().__init__(C=1.0, nu=nu, epsilon=0.0, kernel=kernel, degree=degree, - gamma=gamma, coef0=coef0, tol=tol, - shrinking=shrinking, cache_size=cache_size, - max_iter=max_iter, tau=tau, class_weight=class_weight, - decision_function_shape=decision_function_shape, - break_ties=break_ties, algorithm=algorithm) + def __init__( + self, + nu=0.5, + kernel="rbf", + *, + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + shrinking=True, + cache_size=200.0, + max_iter=-1, + tau=1e-12, + class_weight=None, + decision_function_shape="ovr", + break_ties=False, + algorithm="thunder", + **kwargs, + ): + super().__init__( + C=1.0, + nu=nu, + epsilon=0.0, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + shrinking=shrinking, + cache_size=cache_size, + max_iter=max_iter, + tau=tau, + class_weight=class_weight, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + algorithm=algorithm, + ) self.svm_type = SVMtype.nu_svc def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) return y def fit(self, X, y, sample_weight=None, queue=None): diff --git a/onedal/svm/tests/test_csr_svm.py b/onedal/svm/tests/test_csr_svm.py index a623e9c2ea..a1f445868e 100644 --- a/onedal/svm/tests/test_csr_svm.py +++ b/onedal/svm/tests/test_csr_svm.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from scipy import sparse as sp - -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import SVC, SVR - -from sklearn.utils.estimator_checks import check_estimator +import pytest import sklearn.utils.estimator_checks +from numpy.testing import assert_array_almost_equal, assert_array_equal +from scipy import sparse as sp from sklearn import datasets, metrics +from sklearn.base import clone as clone_estimator +from sklearn.datasets import make_blobs, make_classification from sklearn.metrics.pairwise import rbf_kernel -from sklearn.datasets import make_classification, make_blobs from sklearn.model_selection import train_test_split -from sklearn.base import clone as clone_estimator +from sklearn.utils.estimator_checks import check_estimator -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import SVC, SVR +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def is_classifier(estimator): @@ -49,18 +48,24 @@ def check_svm_model_equal(queue, svm, X_train, y_train, X_test, decimal=6): sparse_svm.fit(X_train, y_train, queue=queue) assert sp.issparse(sparse_svm.support_vectors_) assert sp.issparse(sparse_svm.dual_coef_) - assert_array_almost_equal(dense_svm.support_vectors_, - sparse_svm.support_vectors_.toarray(), decimal) - assert_array_almost_equal(dense_svm.dual_coef_, - sparse_svm.dual_coef_.toarray(), decimal) + assert_array_almost_equal( + dense_svm.support_vectors_, sparse_svm.support_vectors_.toarray(), decimal + ) + assert_array_almost_equal( + dense_svm.dual_coef_, sparse_svm.dual_coef_.toarray(), decimal + ) assert_array_almost_equal(dense_svm.support_, sparse_svm.support_) - assert_array_almost_equal(dense_svm.predict(X_test_dense, queue=queue), - sparse_svm.predict(X_test, queue=queue)) + assert_array_almost_equal( + dense_svm.predict(X_test_dense, queue=queue), + sparse_svm.predict(X_test, queue=queue), + ) if is_classifier(svm): - assert_array_almost_equal(dense_svm.decision_function(X_test_dense, queue=queue), - sparse_svm.decision_function(X_test, queue=queue), - decimal) + assert_array_almost_equal( + dense_svm.decision_function(X_test_dense, queue=queue), + sparse_svm.decision_function(X_test, queue=queue), + decimal, + ) def _test_simple_dataset(queue, kernel): @@ -77,12 +82,20 @@ def _test_simple_dataset(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail( - reason="raises UnknownError instead of RuntimeError " - "with unimplemented message"))]) -@pytest.mark.parametrize('kernel', ['linear', 'rbf']) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises UnknownError instead of RuntimeError " + "with unimplemented message" + ), + ) + ], +) +@pytest.mark.parametrize("kernel", ["linear", "rbf"]) def test_simple_dataset(queue, kernel): _test_simple_dataset(queue, kernel) @@ -97,13 +110,21 @@ def _test_binary_dataset(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - 
marks=pytest.mark.xfail( - reason="raises UnknownError for linear and rbf, " - "Unimplemented error with inconsistent error message " - "for poly and sigmoid"))]) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises UnknownError for linear and rbf, " + "Unimplemented error with inconsistent error message " + "for poly and sigmoid" + ), + ) + ], +) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_binary_dataset(queue, kernel): _test_binary_dataset(queue, kernel) @@ -123,8 +144,8 @@ def _test_iris(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_iris(queue, kernel): _test_iris(queue, kernel) @@ -140,35 +161,191 @@ def _test_diabetes(queue, kernel): @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes(queue, kernel): _test_diabetes(queue, kernel) @pass_if_not_implemented_for_gpu(reason="csr svm is not implemented") @pytest.mark.xfail(reason="Failed test. Need investigate") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sparse_realdata(queue): data = np.array([0.03771744, 0.1003567, 0.01174647, 0.027069]) indices = np.array([6, 5, 35, 31]) indptr = np.array( - [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4]) + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + ] + ) X = sp.csr_matrix((data, indices, indptr)) y = np.array( - [1., 0., 2., 2., 1., 1., 1., 2., 2., 0., 1., 2., 2., - 0., 2., 0., 3., 0., 3., 0., 1., 1., 3., 2., 3., 2., - 0., 3., 1., 0., 2., 1., 2., 0., 1., 0., 2., 3., 1., - 3., 0., 1., 0., 0., 2., 0., 1., 2., 2., 2., 3., 2., - 0., 3., 2., 1., 2., 3., 2., 2., 0., 1., 0., 1., 2., - 3., 0., 0., 2., 2., 1., 3., 1., 1., 0., 1., 2., 1., - 1., 3.]) - - clf = SVC(kernel='linear').fit(X.toarray(), y, queue=queue) - sp_clf = SVC(kernel='linear').fit(X, y, queue=queue) + [ + 1.0, + 0.0, + 2.0, + 2.0, + 1.0, + 1.0, + 1.0, + 2.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 0.0, + 2.0, + 0.0, + 3.0, + 0.0, + 3.0, + 0.0, + 1.0, + 1.0, + 3.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 1.0, + 0.0, + 2.0, + 1.0, + 2.0, + 0.0, + 1.0, + 0.0, + 2.0, + 3.0, + 1.0, + 3.0, + 0.0, + 1.0, + 0.0, + 0.0, + 2.0, + 0.0, + 1.0, + 2.0, + 2.0, + 2.0, + 3.0, + 2.0, + 0.0, + 3.0, + 2.0, + 1.0, + 2.0, + 3.0, + 2.0, + 2.0, + 0.0, + 1.0, + 0.0, + 1.0, + 2.0, + 3.0, + 
0.0, + 0.0, + 2.0, + 2.0, + 1.0, + 3.0, + 1.0, + 1.0, + 0.0, + 1.0, + 2.0, + 1.0, + 1.0, + 3.0, + ] + ) + + clf = SVC(kernel="linear").fit(X.toarray(), y, queue=queue) + sp_clf = SVC(kernel="linear").fit(X, y, queue=queue) assert_array_equal(clf.support_vectors_, sp_clf.support_vectors_.toarray()) assert_array_equal(clf.dual_coef_, sp_clf.dual_coef_.toarray()) diff --git a/onedal/svm/tests/test_nusvc.py b/onedal/svm/tests/test_nusvc.py index 68fc0c0390..4fa1d83ddf 100644 --- a/onedal/svm/tests/test_nusvc.py +++ b/onedal/svm/tests/test_nusvc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,48 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import NuSVC -from sklearn.svm import NuSVC as SklearnNuSVC - +import pytest +from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn import datasets from sklearn.datasets import make_blobs from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split +from sklearn.svm import NuSVC as SklearnNuSVC -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import NuSVC +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def _test_libsvm_parameters(queue, array_constr, dtype): - X = array_constr([[-2, -1], [-1, -1], [-1, -2], - [1, 1], [1, 2], [2, 1]], dtype=dtype) + X = array_constr([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=dtype) y = array_constr([1, 1, 1, 2, 2, 2], dtype=dtype) - clf = NuSVC(kernel='linear').fit(X, y, queue=queue) + clf = NuSVC(kernel="linear").fit(X, y, queue=queue) assert_array_almost_equal( - clf.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]]) + clf.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]] + ) assert_array_equal(clf.support_, [0, 1, 3, 4]) assert_array_equal(clf.support_vectors_, X[clf.support_]) - assert_array_equal(clf.intercept_, [0.]) + assert_array_equal(clf.intercept_, [0.0]) assert_array_equal(clf.predict(X, queue=queue), y) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('array_constr', [np.array]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("array_constr", [np.array]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_libsvm_parameters(queue, array_constr, dtype): _test_libsvm_parameters(queue, array_constr, dtype) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_class_weight(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) y = np.array([1, 1, 1, 2, 2, 2]) @@ -64,23 +64,23 @@ def 
test_class_weight(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sample_weight(queue): X = np.array([[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 2]]) y = np.array([1, 1, 1, 2, 2, 2]) - clf = NuSVC(kernel='linear') + clf = NuSVC(kernel="linear") clf.fit(X, y, sample_weight=[1] * 6, queue=queue) assert_array_almost_equal(clf.intercept_, [0.0]) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function(queue): X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] Y = [1, 1, 1, 2, 2, 2] - clf = NuSVC(kernel='rbf', gamma=1, decision_function_shape='ovo') + clf = NuSVC(kernel="rbf", gamma=1, decision_function_shape="ovo") clf.fit(X, Y, queue=queue) rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) @@ -89,23 +89,24 @@ def test_decision_function(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() - clf = NuSVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = NuSVC(kernel="linear").fit(iris.data, iris.target, queue=queue) assert clf.score(iris.data, iris.target, queue=queue) > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function_shape(queue): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # check shape of ovo_decition_function=True - clf = NuSVC(kernel='linear', - decision_function_shape='ovo').fit(X_train, y_train, queue=queue) + clf = NuSVC(kernel="linear", decision_function_shape="ovo").fit( + X_train, y_train, queue=queue + ) dec = clf.decision_function(X_train, queue=queue) assert dec.shape == (len(X_train), 10) @@ -114,13 +115,14 @@ def test_decision_function_shape(queue): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): iris = datasets.load_iris() - clf = NuSVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = NuSVC(kernel="linear").fit(iris.data, iris.target, queue=queue) expected = clf.decision_function(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) @@ -132,11 +134,11 @@ def test_pickle(queue): def _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): cancer = datasets.load_breast_cancer() - clf = NuSVC(kernel='rbf', gamma=gamma, nu=nu) + clf = NuSVC(kernel="rbf", gamma=gamma, nu=nu) clf.fit(cancer.data, cancer.target, queue=queue) result = clf.score(cancer.data, cancer.target, queue=queue) - clf = SklearnNuSVC(kernel='rbf', gamma=gamma, nu=nu) + clf = SklearnNuSVC(kernel="rbf", gamma=gamma, nu=nu) clf.fit(cancer.data, cancer.target) expected = clf.score(cancer.data, cancer.target) @@ -145,9 +147,9 @@ def _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma): @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) 
-@pytest.mark.parametrize('nu', [0.25, 0.5])
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("gamma", ["scale", "auto"])
+@pytest.mark.parametrize("nu", [0.25, 0.5])
 def test_cancer_rbf_compare_with_sklearn(queue, nu, gamma):
     _test_cancer_rbf_compare_with_sklearn(queue, nu, gamma)
 
@@ -155,11 +157,11 @@ def test_cancer_rbf_compare_with_sklearn(queue, nu, gamma):
 def _test_cancer_linear_compare_with_sklearn(queue, nu):
     cancer = datasets.load_breast_cancer()
 
-    clf = NuSVC(kernel='linear', nu=nu)
+    clf = NuSVC(kernel="linear", nu=nu)
     clf.fit(cancer.data, cancer.target, queue=queue)
     result = clf.score(cancer.data, cancer.target, queue=queue)
 
-    clf = SklearnNuSVC(kernel='linear', nu=nu)
+    clf = SklearnNuSVC(kernel="linear", nu=nu)
     clf.fit(cancer.data, cancer.target)
     expected = clf.score(cancer.data, cancer.target)
 
@@ -168,8 +170,8 @@ def _test_cancer_linear_compare_with_sklearn(queue, nu):
 
 
 @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented")
-@pytest.mark.parametrize('queue', get_queues())
-@pytest.mark.parametrize('nu', [0.25, 0.5])
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize("nu", [0.25, 0.5])
 def test_cancer_linear_compare_with_sklearn(queue, nu):
     _test_cancer_linear_compare_with_sklearn(queue, nu)
 
@@ -177,11 +179,11 @@ def test_cancer_linear_compare_with_sklearn(queue, nu):
 def _test_cancer_poly_compare_with_sklearn(queue, params):
     cancer = datasets.load_breast_cancer()
 
-    clf = NuSVC(kernel='poly', **params)
+    clf = NuSVC(kernel="poly", **params)
     clf.fit(cancer.data, cancer.target, queue=queue)
     result = clf.score(cancer.data, cancer.target, queue=queue)
 
-    clf = SklearnNuSVC(kernel='poly', **params)
+    clf = SklearnNuSVC(kernel="poly", **params)
     clf.fit(cancer.data, cancer.target)
     expected = clf.score(cancer.data, cancer.target)
 
@@ -190,10 +192,13 @@ def _test_cancer_poly_compare_with_sklearn(queue, params):
 
 
 @pass_if_not_implemented_for_gpu(reason="nusvc is not implemented")
-@pytest.mark.parametrize('queue', get_queues())
-@pytest.mark.parametrize('params', [
-    {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'nu': .25},
-    {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'nu': .5}
-])
+@pytest.mark.parametrize("queue", get_queues())
+@pytest.mark.parametrize(
+    "params",
+    [
+        {"degree": 2, "coef0": 0.1, "gamma": "scale", "nu": 0.25},
+        {"degree": 3, "coef0": 0.0, "gamma": "scale", "nu": 0.5},
+    ],
+)
 def test_cancer_poly_compare_with_sklearn(queue, params):
     _test_cancer_poly_compare_with_sklearn(queue, params)
diff --git a/onedal/svm/tests/test_nusvr.py b/onedal/svm/tests/test_nusvr.py
index 8dbe608934..fd85317687 100644
--- a/onedal/svm/tests/test_nusvr.py
+++ b/onedal/svm/tests/test_nusvr.py
@@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright 2021 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,58 +12,54 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_allclose, assert_array_almost_equal +import pytest +from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal from sklearn import datasets from sklearn.metrics.pairwise import rbf_kernel - -from onedal.svm import NuSVR from sklearn.svm import NuSVR as SklearnNuSVR -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) - +from onedal.svm import NuSVR +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) -synth_params = { - 'n_samples': 500, - 'n_features': 100, - 'random_state': 42 -} +synth_params = {"n_samples": 500, "n_features": 100, "random_state": 42} @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_diabetes_simple(queue): diabetes = datasets.load_diabetes() - clf = NuSVR(kernel='linear', C=10.) + clf = NuSVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert clf.score(diabetes.data, diabetes.target, queue=queue) > 0.02 @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_input_format_for_diabetes(queue): diabetes = datasets.load_diabetes() - c_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='C') + c_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="C") assert c_contiguous_numpy.flags.c_contiguous assert not c_contiguous_numpy.flags.f_contiguous assert not c_contiguous_numpy.flags.fnc - clf = NuSVR(kernel='linear', C=10.) + clf = NuSVR(kernel="linear", C=10.0) clf.fit(c_contiguous_numpy, diabetes.target, queue=queue) dual_c_contiguous_numpy = clf.dual_coef_ res_c_contiguous_numpy = clf.predict(c_contiguous_numpy, queue=queue) - f_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='F') + f_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="F") assert not f_contiguous_numpy.flags.c_contiguous assert f_contiguous_numpy.flags.f_contiguous assert f_contiguous_numpy.flags.fnc - clf = NuSVR(kernel='linear', C=10.) 
+ clf = NuSVR(kernel="linear", C=10.0) clf.fit(f_contiguous_numpy, diabetes.target, queue=queue) dual_f_contiguous_numpy = clf.dual_coef_ res_f_contiguous_numpy = clf.predict(f_contiguous_numpy, queue=queue) @@ -72,19 +68,19 @@ def test_input_format_for_diabetes(queue): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_predict(queue): iris = datasets.load_iris() X = iris.data y = iris.target - reg = NuSVR(kernel='linear', C=0.1).fit(X, y, queue=queue) + reg = NuSVR(kernel="linear", C=0.1).fit(X, y, queue=queue) linear = np.dot(X, reg.support_vectors_.T) dec = np.dot(linear, reg.dual_coef_.T) + reg.intercept_ assert_array_almost_equal(dec.ravel(), reg.predict(X, queue=queue).ravel()) - reg = NuSVR(kernel='rbf', gamma=1).fit(X, y, queue=queue) + reg = NuSVR(kernel="rbf", gamma=1).fit(X, y, queue=queue) rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ @@ -93,24 +89,25 @@ def test_predict(queue): def _test_diabetes_compare_with_sklearn(queue, kernel): diabetes = datasets.load_diabetes() - clf_onedal = NuSVR(kernel=kernel, nu=.25, C=10.) + clf_onedal = NuSVR(kernel=kernel, nu=0.25, C=10.0) clf_onedal.fit(diabetes.data, diabetes.target, queue=queue) result = clf_onedal.score(diabetes.data, diabetes.target, queue=queue) - clf_sklearn = SklearnNuSVR(kernel=kernel, nu=.25, C=10.) + clf_sklearn = SklearnNuSVR(kernel=kernel, nu=0.25, C=10.0) clf_sklearn.fit(diabetes.data, diabetes.target) expected = clf_sklearn.score(diabetes.data, diabetes.target) assert result > expected - 1e-5 assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-3) - assert_allclose(clf_sklearn.support_vectors_.shape, - clf_sklearn.support_vectors_.shape) + assert_allclose( + clf_sklearn.support_vectors_.shape, clf_sklearn.support_vectors_.shape + ) assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-2) @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes_compare_with_sklearn(queue, kernel): _test_diabetes_compare_with_sklearn(queue, kernel) @@ -118,11 +115,11 @@ def test_diabetes_compare_with_sklearn(queue, kernel): def _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='rbf', gamma=gamma, C=C, nu=nu) + clf = NuSVR(kernel="rbf", gamma=gamma, C=C, nu=nu) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='rbf', gamma=gamma, C=C, nu=nu) + clf = SklearnNuSVR(kernel="rbf", gamma=gamma, C=C, nu=nu) clf.fit(x, y) expected = clf.score(x, y) @@ -131,10 +128,10 @@ def _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) -@pytest.mark.parametrize('C', [100.0, 1000.0]) -@pytest.mark.parametrize('nu', [0.25, 0.75]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", ["scale", "auto"]) +@pytest.mark.parametrize("C", [100.0, 1000.0]) +@pytest.mark.parametrize("nu", [0.25, 0.75]) def test_synth_rbf_compare_with_sklearn(queue, 
C, nu, gamma): _test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma) @@ -142,11 +139,11 @@ def test_synth_rbf_compare_with_sklearn(queue, C, nu, gamma): def _test_synth_linear_compare_with_sklearn(queue, C, nu): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='linear', C=C, nu=nu) + clf = NuSVR(kernel="linear", C=C, nu=nu) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='linear', C=C, nu=nu) + clf = SklearnNuSVR(kernel="linear", C=C, nu=nu) clf.fit(x, y) expected = clf.score(x, y) @@ -157,9 +154,9 @@ def _test_synth_linear_compare_with_sklearn(queue, C, nu): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('C', [0.001, 0.1]) -@pytest.mark.parametrize('nu', [0.25, 0.75]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("C", [0.001, 0.1]) +@pytest.mark.parametrize("nu", [0.25, 0.75]) def test_synth_linear_compare_with_sklearn(queue, C, nu): _test_synth_linear_compare_with_sklearn(queue, C, nu) @@ -167,11 +164,11 @@ def test_synth_linear_compare_with_sklearn(queue, C, nu): def _test_synth_poly_compare_with_sklearn(queue, params): x, y = datasets.make_regression(**synth_params) - clf = NuSVR(kernel='poly', **params) + clf = NuSVR(kernel="poly", **params) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnNuSVR(kernel='poly', **params) + clf = SklearnNuSVR(kernel="poly", **params) clf.fit(x, y) expected = clf.score(x, y) @@ -180,25 +177,29 @@ def _test_synth_poly_compare_with_sklearn(queue, params): @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('params', [ - {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'C': 100, 'nu': .25}, - {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'C': 1000, 'nu': .75} -]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize( + "params", + [ + {"degree": 2, "coef0": 0.1, "gamma": "scale", "C": 100, "nu": 0.25}, + {"degree": 3, "coef0": 0.0, "gamma": "scale", "C": 1000, "nu": 0.75}, + ], +) def test_synth_poly_compare_with_sklearn(queue, params): _test_synth_poly_compare_with_sklearn(queue, params) @pass_if_not_implemented_for_gpu(reason="nusvr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): diabetes = datasets.load_diabetes() - clf = NuSVR(kernel='rbf', C=10.) + clf = NuSVR(kernel="rbf", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) expected = clf.predict(diabetes.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/svm/tests/test_svc.py b/onedal/svm/tests/test_svc.py index a6599a97f0..284a6b20f3 100644 --- a/onedal/svm/tests/test_svc.py +++ b/onedal/svm/tests/test_svc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,23 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -import pytest import numpy as np -from numpy.testing import assert_array_equal, assert_array_almost_equal - -from onedal.svm import SVC - -from sklearn.utils.estimator_checks import check_estimator +import pytest import sklearn.utils.estimator_checks +from numpy.testing import assert_array_almost_equal, assert_array_equal from sklearn import datasets -from sklearn.metrics.pairwise import rbf_kernel from sklearn.datasets import make_blobs +from sklearn.metrics.pairwise import rbf_kernel from sklearn.model_selection import train_test_split +from sklearn.utils.estimator_checks import check_estimator -from onedal.tests.utils._device_selection import (get_queues, - pass_if_not_implemented_for_gpu) +from onedal.svm import SVC +from onedal.tests.utils._device_selection import ( + get_queues, + pass_if_not_implemented_for_gpu, +) def _replace_and_save(md, fns, replacing_fn): @@ -53,42 +53,52 @@ def dummy(*args, **kwargs): pass md = sklearn.utils.estimator_checks - saved = _replace_and_save(md, [ - 'check_sample_weights_invariance', # Max absolute difference: 0.0008 - 'check_estimators_fit_returns_self', # ValueError: empty metadata - 'check_classifiers_train', # assert y_pred.shape == (n_samples,) - 'check_estimators_unfitted', # Call 'fit' with appropriate arguments - ], dummy) + saved = _replace_and_save( + md, + [ + "check_sample_weights_invariance", # Max absolute difference: 0.0008 + "check_estimators_fit_returns_self", # ValueError: empty metadata + "check_classifiers_train", # assert y_pred.shape == (n_samples,) + "check_estimators_unfitted", # Call 'fit' with appropriate arguments + ], + dummy, + ) check_estimator(SVC()) _restore_from_saved(md, saved) def _test_libsvm_parameters(queue, array_constr, dtype): - X = array_constr([[-2, -1], [-1, -1], [-1, -2], - [1, 1], [1, 2], [2, 1]], dtype=dtype) + X = array_constr([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=dtype) y = array_constr([1, 1, 1, 2, 2, 2], dtype=dtype) - clf = SVC(kernel='linear').fit(X, y, queue=queue) - assert_array_equal(clf.dual_coef_, [[-0.25, .25]]) + clf = SVC(kernel="linear").fit(X, y, queue=queue) + assert_array_equal(clf.dual_coef_, [[-0.25, 0.25]]) assert_array_equal(clf.support_, [1, 3]) assert_array_equal(clf.support_vectors_, (X[1], X[3])) - assert_array_equal(clf.intercept_, [0.]) + assert_array_equal(clf.intercept_, [0.0]) assert_array_equal(clf.predict(X), y) # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) -@pytest.mark.parametrize('array_constr', [np.array]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize("queue", get_queues("cpu")) +@pytest.mark.parametrize("array_constr", [np.array]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_libsvm_parameters(queue, array_constr, dtype): _test_libsvm_parameters(queue, array_constr, dtype) -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail( - reason="class weights are not implemented " - "but the error is not raised"))]) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="class weights are not implemented " "but the error is not raised" + ), + ) + ], +) def test_class_weight(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 
1], [1, 2], [2, 1]]) y = np.array([1, 1, 1, 2, 2, 2]) @@ -99,22 +109,22 @@ def test_class_weight(queue): # TODO: investigate sporadic failures on GPU -@pytest.mark.parametrize('queue', get_queues('cpu')) +@pytest.mark.parametrize("queue", get_queues("cpu")) def test_sample_weight(queue): X = np.array([[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 2]]) y = np.array([1, 1, 1, 2, 2, 2]) - clf = SVC(kernel='linear') + clf = SVC(kernel="linear") clf.fit(X, y, sample_weight=[1] * 6, queue=queue) assert_array_almost_equal(clf.intercept_, [0.0]) -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function(queue): X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]], dtype=np.float32) Y = np.array([1, 1, 1, 2, 2, 2], dtype=np.float32) - clf = SVC(kernel='rbf', gamma=1, decision_function_shape='ovo') + clf = SVC(kernel="rbf", gamma=1, decision_function_shape="ovo") clf.fit(X, Y, queue=queue) rbfs = rbf_kernel(X, clf.support_vectors_, gamma=clf.gamma) @@ -123,38 +133,40 @@ def test_decision_function(queue): @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_iris(queue): iris = datasets.load_iris() - clf = SVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = SVC(kernel="linear").fit(iris.data, iris.target, queue=queue) assert clf.score(iris.data, iris.target, queue=queue) > 0.9 assert_array_equal(clf.classes_, np.sort(clf.classes_)) @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_decision_function_shape(queue): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # check shape of ovo_decition_function=True - clf = SVC(kernel='linear', - decision_function_shape='ovo').fit(X_train, y_train, queue=queue) + clf = SVC(kernel="linear", decision_function_shape="ovo").fit( + X_train, y_train, queue=queue + ) dec = clf.decision_function(X_train, queue=queue) assert dec.shape == (len(X_train), 10) with pytest.raises(ValueError, match="must be either 'ovr' or 'ovo'"): - SVC(decision_function_shape='bad').fit(X_train, y_train, queue=queue) + SVC(decision_function_shape="bad").fit(X_train, y_train, queue=queue) @pass_if_not_implemented_for_gpu(reason="multiclass svm is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): iris = datasets.load_iris() - clf = SVC(kernel='linear').fit(iris.data, iris.target, queue=queue) + clf = SVC(kernel="linear").fit(iris.data, iris.target, queue=queue) expected = clf.decision_function(iris.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) @@ -164,18 +176,26 @@ def test_pickle(queue): @pass_if_not_implemented_for_gpu(reason="sigmoid kernel is not implemented") -@pytest.mark.parametrize('queue', get_queues('cpu') + [ - pytest.param(get_queues('gpu'), - marks=pytest.mark.xfail(reason="raises Unimplemented error " - "with inconsistent error message"))]) -@pytest.mark.parametrize('dtype', [np.float32, np.float64]) +@pytest.mark.parametrize( + "queue", + get_queues("cpu") + + [ + pytest.param( + get_queues("gpu"), + marks=pytest.mark.xfail( + reason="raises Unimplemented error " "with inconsistent error message" + ), + ) + ], 
+)
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_svc_sigmoid(queue, dtype):
-    X_train = np.array([[-1, 2], [0, 0], [2, -1],
-                        [+1, +1], [+1, +2], [+2, +1]], dtype=dtype)
-    X_test = np.array([[0, 2], [0.5, 0.5],
-                       [0.3, 0.1], [2, 0], [-1, -1]], dtype=dtype)
+    X_train = np.array(
+        [[-1, 2], [0, 0], [2, -1], [+1, +1], [+1, +2], [+2, +1]], dtype=dtype
+    )
+    X_test = np.array([[0, 2], [0.5, 0.5], [0.3, 0.1], [2, 0], [-1, -1]], dtype=dtype)
     y_train = np.array([1, 1, 1, 2, 2, 2], dtype=dtype)
 
-    svc = SVC(kernel='sigmoid').fit(X_train, y_train, queue=queue)
+    svc = SVC(kernel="sigmoid").fit(X_train, y_train, queue=queue)
     assert_array_equal(svc.dual_coef_, [[-1, -1, -1, 1, 1, 1]])
     assert_array_equal(svc.support_, [0, 1, 2, 3, 4, 5])
diff --git a/onedal/svm/tests/test_svr.py b/onedal/svm/tests/test_svr.py
index f03d6813b7..6ad7822d4a 100644
--- a/onedal/svm/tests/test_svr.py
+++ b/onedal/svm/tests/test_svr.py
@@ -1,4 +1,4 @@
-#===============================================================================
+# ===============================================================================
 # Copyright 2021 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,29 +12,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+ "check_regressors_train", # Cannot get data type from empty metadata + "check_estimators_unfitted", # expected NotFittedError from sklearn + ], + dummy, + ) check_estimator(SVR()) _restore_from_saved(md, saved) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_run_to_run_fit(queue): diabetes = datasets.load_diabetes() - clf_first = SVR(kernel='linear', C=10.) + clf_first = SVR(kernel="linear", C=10.0) clf_first.fit(diabetes.data, diabetes.target, queue=queue) for _ in range(10): - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert_allclose(clf_first.intercept_, clf.intercept_) assert_allclose(clf_first.support_vectors_, clf.support_vectors_) @@ -85,35 +84,35 @@ def test_run_to_run_fit(queue): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_diabetes_simple(queue): diabetes = datasets.load_diabetes() - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) assert clf.score(diabetes.data, diabetes.target, queue=queue) > 0.02 @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_input_format_for_diabetes(queue): diabetes = datasets.load_diabetes() - c_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='C') + c_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="C") assert c_contiguous_numpy.flags.c_contiguous assert not c_contiguous_numpy.flags.f_contiguous assert not c_contiguous_numpy.flags.fnc - clf = SVR(kernel='linear', C=10.) + clf = SVR(kernel="linear", C=10.0) clf.fit(c_contiguous_numpy, diabetes.target, queue=queue) dual_c_contiguous_numpy = clf.dual_coef_ res_c_contiguous_numpy = clf.predict(c_contiguous_numpy, queue=queue) - f_contiguous_numpy = np.asanyarray(diabetes.data, dtype='float', order='F') + f_contiguous_numpy = np.asanyarray(diabetes.data, dtype="float", order="F") assert not f_contiguous_numpy.flags.c_contiguous assert f_contiguous_numpy.flags.f_contiguous assert f_contiguous_numpy.flags.fnc - clf = SVR(kernel='linear', C=10.) 
+ clf = SVR(kernel="linear", C=10.0) clf.fit(f_contiguous_numpy, diabetes.target, queue=queue) dual_f_contiguous_numpy = clf.dual_coef_ res_f_contiguous_numpy = clf.predict(f_contiguous_numpy, queue=queue) @@ -122,19 +121,19 @@ def test_input_format_for_diabetes(queue): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_predict(queue): iris = datasets.load_iris() X = iris.data y = iris.target - reg = SVR(kernel='linear', C=0.1).fit(X, y, queue=queue) + reg = SVR(kernel="linear", C=0.1).fit(X, y, queue=queue) linear = np.dot(X, reg.support_vectors_.T) dec = np.dot(linear, reg.dual_coef_.T) + reg.intercept_ assert_array_almost_equal(dec.ravel(), reg.predict(X, queue=queue).ravel()) - reg = SVR(kernel='rbf', gamma=1).fit(X, y, queue=queue) + reg = SVR(kernel="rbf", gamma=1).fit(X, y, queue=queue) rbfs = rbf_kernel(X, reg.support_vectors_, gamma=reg.gamma) dec = np.dot(rbfs, reg.dual_coef_.T) + reg.intercept_ @@ -143,35 +142,36 @@ def test_predict(queue): def _test_diabetes_compare_with_sklearn(queue, kernel): diabetes = datasets.load_diabetes() - clf_onedal = SVR(kernel=kernel, C=10., gamma=2) + clf_onedal = SVR(kernel=kernel, C=10.0, gamma=2) clf_onedal.fit(diabetes.data, diabetes.target, queue=queue) result = clf_onedal.score(diabetes.data, diabetes.target, queue=queue) - clf_sklearn = SklearnSVR(kernel=kernel, C=10., gamma=2) + clf_sklearn = SklearnSVR(kernel=kernel, C=10.0, gamma=2) clf_sklearn.fit(diabetes.data, diabetes.target) expected = clf_sklearn.score(diabetes.data, diabetes.target) assert result > expected - 1e-5 assert_allclose(clf_sklearn.intercept_, clf_onedal.intercept_, atol=1e-3) - assert_allclose(clf_sklearn.support_vectors_.shape, - clf_sklearn.support_vectors_.shape) + assert_allclose( + clf_sklearn.support_vectors_.shape, clf_sklearn.support_vectors_.shape + ) assert_allclose(clf_sklearn.dual_coef_, clf_onedal.dual_coef_, atol=1e-1) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('kernel', ['linear', 'rbf', 'poly', 'sigmoid']) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("kernel", ["linear", "rbf", "poly", "sigmoid"]) def test_diabetes_compare_with_sklearn(queue, kernel): _test_diabetes_compare_with_sklearn(queue, kernel) def _test_synth_rbf_compare_with_sklearn(queue, C, gamma): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='rbf', gamma=gamma, C=C) + clf = SVR(kernel="rbf", gamma=gamma, C=C) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='rbf', gamma=gamma, C=C) + clf = SklearnSVR(kernel="rbf", gamma=gamma, C=C) clf.fit(x, y) expected = clf.score(x, y) @@ -180,20 +180,20 @@ def _test_synth_rbf_compare_with_sklearn(queue, C, gamma): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('gamma', ['scale', 'auto']) -@pytest.mark.parametrize('C', [100.0, 1000.0]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("gamma", ["scale", "auto"]) +@pytest.mark.parametrize("C", [100.0, 1000.0]) def test_synth_rbf_compare_with_sklearn(queue, C, gamma): _test_synth_rbf_compare_with_sklearn(queue, C, gamma) def _test_synth_linear_compare_with_sklearn(queue, C): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='linear', C=C) + clf = SVR(kernel="linear", 
C=C) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='linear', C=C) + clf = SklearnSVR(kernel="linear", C=C) clf.fit(x, y) expected = clf.score(x, y) @@ -204,19 +204,19 @@ def _test_synth_linear_compare_with_sklearn(queue, C): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('C', [0.001, 0.1]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize("C", [0.001, 0.1]) def test_synth_linear_compare_with_sklearn(queue, C): _test_synth_linear_compare_with_sklearn(queue, C) def _test_synth_poly_compare_with_sklearn(queue, params): x, y = datasets.make_regression(**synth_params) - clf = SVR(kernel='poly', **params) + clf = SVR(kernel="poly", **params) clf.fit(x, y, queue=queue) result = clf.score(x, y, queue=queue) - clf = SklearnSVR(kernel='poly', **params) + clf = SklearnSVR(kernel="poly", **params) clf.fit(x, y) expected = clf.score(x, y) @@ -225,48 +225,52 @@ def _test_synth_poly_compare_with_sklearn(queue, params): @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) -@pytest.mark.parametrize('params', [ - {'degree': 2, 'coef0': 0.1, 'gamma': 'scale', 'C': 100}, - {'degree': 3, 'coef0': 0.0, 'gamma': 'scale', 'C': 1000} -]) +@pytest.mark.parametrize("queue", get_queues()) +@pytest.mark.parametrize( + "params", + [ + {"degree": 2, "coef0": 0.1, "gamma": "scale", "C": 100}, + {"degree": 3, "coef0": 0.0, "gamma": "scale", "C": 1000}, + ], +) def test_synth_poly_compare_with_sklearn(queue, params): _test_synth_poly_compare_with_sklearn(queue, params) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_sided_sample_weight(queue): - clf = SVR(C=1e-2, kernel='linear') + clf = SVR(C=1e-2, kernel="linear") X = [[-2, 0], [-1, -1], [0, -2], [0, 2], [1, 1], [2, 0]] Y = [1, 1, 1, 2, 2, 2] - sample_weight = [10., .1, .1, .1, .1, 10] + sample_weight = [10.0, 0.1, 0.1, 0.1, 0.1, 10] clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred < 1.5 - sample_weight = [1., .1, 10., 10., .1, .1] + sample_weight = [1.0, 0.1, 10.0, 10.0, 0.1, 0.1] clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred > 1.5 sample_weight = [1] * 6 clf.fit(X, Y, sample_weight=sample_weight, queue=queue) - y_pred = clf.predict([[-1., 1.]], queue=queue) + y_pred = clf.predict([[-1.0, 1.0]], queue=queue) assert y_pred == pytest.approx(1.5) @pass_if_not_implemented_for_gpu(reason="svr is not implemented") -@pytest.mark.parametrize('queue', get_queues()) +@pytest.mark.parametrize("queue", get_queues()) def test_pickle(queue): diabetes = datasets.load_diabetes() - clf = SVR(kernel='rbf', C=10.) 
+ clf = SVR(kernel="rbf", C=10.0) clf.fit(diabetes.data, diabetes.target, queue=queue) expected = clf.predict(diabetes.data, queue=queue) import pickle + dump = pickle.dumps(clf) clf2 = pickle.loads(dump) diff --git a/onedal/tests/utils/_device_selection.py b/onedal/tests/utils/_device_selection.py index fcac7d12cd..73fc060ead 100644 --- a/onedal/tests/utils/_device_selection.py +++ b/onedal/tests/utils/_device_selection.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,22 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest import functools +import pytest + -def get_queues(filter_='cpu,gpu'): +def get_queues(filter_="cpu,gpu"): queues = [] try: import dpctl - if dpctl.has_cpu_devices and 'cpu' in filter_: - queues.append(dpctl.SyclQueue('cpu')) - if dpctl.has_gpu_devices and 'gpu' in filter_: - queues.append(dpctl.SyclQueue('gpu')) + if dpctl.has_cpu_devices and "cpu" in filter_: + queues.append(dpctl.SyclQueue("cpu")) + if dpctl.has_gpu_devices and "gpu" in filter_: + queues.append(dpctl.SyclQueue("gpu")) finally: return queues @@ -35,6 +36,7 @@ def get_queues(filter_='cpu,gpu'): def get_memory_usm(): try: from dpctl.memory import MemoryUSMDevice, MemoryUSMShared + return [MemoryUSMDevice, MemoryUSMShared] except ImportError: return [] @@ -47,9 +49,9 @@ def is_dpctl_available(targets=None): if targets is None: return True for device in targets: - if device == 'cpu' and not dpctl.has_cpu_devices(): + if device == "cpu" and not dpctl.has_cpu_devices(): return False - if device == 'gpu' and not dpctl.has_gpu_devices(): + if device == "gpu" and not dpctl.has_gpu_devices(): return False return True except ImportError: @@ -58,14 +60,15 @@ def is_dpctl_available(targets=None): def device_type_to_str(queue): if queue is None: - return 'cpu' + return "cpu" from dpctl import device_type + if queue.sycl_device.device_type == device_type.cpu: - return 'cpu' + return "cpu" if queue.sycl_device.device_type == device_type.gpu: - return 'gpu' - return 'unknown' + return "gpu" + return "unknown" def pass_if_not_implemented_for_gpu(reason=""): @@ -75,9 +78,11 @@ def decorator(test): @functools.wraps(test) def wrapper(queue, *args, **kwargs): if queue is not None and queue.sycl_device.is_gpu: - with pytest.raises(RuntimeError, match='is not implemented for GPU'): + with pytest.raises(RuntimeError, match="is not implemented for GPU"): test(queue, *args, **kwargs) else: test(queue, *args, **kwargs) + return wrapper + return decorator diff --git a/onedal/utils/__init__.py b/onedal/utils/__init__.py index 5803d2b8d2..ed23be4782 100644 --- a/onedal/utils/__init__.py +++ b/onedal/utils/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,26 +12,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import ( - _column_or_1d, - _validate_targets, - _check_X_y, _check_array, _check_classification_targets, - _type_of_target, + _check_n_features, + _check_X_y, + _column_or_1d, + _is_arraylike, + _is_arraylike_not_scalar, _is_integral_float, _is_multilabel, - _check_n_features, _num_features, _num_samples, - _is_arraylike, - _is_arraylike_not_scalar + _type_of_target, + _validate_targets, ) -__all__ = ['_column_or_1d', '_validate_targets', '_check_X_y', - '_check_array', '_check_classification_targets', - '_type_of_target', '_is_integral_float', - '_is_multilabel', '_check_n_features', '_num_features', - '_num_samples', '_is_arraylike', '_is_arraylike_not_scalar'] +__all__ = [ + "_column_or_1d", + "_validate_targets", + "_check_X_y", + "_check_array", + "_check_classification_targets", + "_type_of_target", + "_is_integral_float", + "_is_multilabel", + "_check_n_features", + "_num_features", + "_num_samples", + "_is_arraylike", + "_is_arraylike_not_scalar", +] diff --git a/onedal/utils/validation.py b/onedal/utils/validation.py index ce3fa8a9b7..b163873a13 100644 --- a/onedal/utils/validation.py +++ b/onedal/utils/validation.py @@ -14,20 +14,21 @@ # limitations under the License. # =============================================================================== -import numpy as np import warnings +from collections.abc import Sequence +from numbers import Integral + +import numpy as np from scipy import sparse as sp -from scipy.sparse import issparse, dok_matrix, lil_matrix +from scipy.sparse import dok_matrix, issparse, lil_matrix from sklearn.preprocessing import LabelEncoder from sklearn.utils.validation import check_array -from collections.abc import Sequence -from numbers import Integral + from daal4py.sklearn.utils.validation import _assert_all_finite class DataConversionWarning(UserWarning): - """Warning used to notify implicit data conversions happening in the code. - """ + """Warning used to notify implicit data conversions happening in the code.""" def _is_arraylike(x): @@ -52,24 +53,26 @@ def _column_or_1d(y, warn=False): return np.ravel(y) if len(shape) == 2 and shape[1] == 1: if warn: - warnings.warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples, ), for example using ravel().", - DataConversionWarning, stacklevel=2) + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. 
Please change the shape of y to " + "(n_samples, ), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) return np.ravel(y) raise ValueError( - "y should be a 1d array, " - "got an array of shape {} instead.".format(shape)) + "y should be a 1d array, " "got an array of shape {} instead.".format(shape) + ) def _compute_class_weight(class_weight, classes, y): if set(y) - set(classes): - raise ValueError("classes should include all valid labels that can " - "be in y") + raise ValueError("classes should include all valid labels that can " "be in y") if class_weight is None or len(class_weight) == 0: - weight = np.ones(classes.shape[0], dtype=np.float64, order='C') - elif class_weight == 'balanced': + weight = np.ones(classes.shape[0], dtype=np.float64, order="C") + elif class_weight == "balanced": y_ = _column_or_1d(y) classes, _ = np.unique(y_, return_inverse=True) @@ -82,10 +85,12 @@ def _compute_class_weight(class_weight, classes, y): weight = len(y_) / (len(le.classes_) * y_bin) else: # user-defined dictionary - weight = np.ones(classes.shape[0], dtype=np.float64, order='C') + weight = np.ones(classes.shape[0], dtype=np.float64, order="C") if not isinstance(class_weight, dict): - raise ValueError("class_weight must be dict, 'balanced', or None," - " got: %r" % class_weight) + raise ValueError( + "class_weight must be dict, 'balanced', or None," + " got: %r" % class_weight + ) for c in class_weight: i = np.searchsorted(classes, c) if i >= len(classes) or classes[i] != c: @@ -99,23 +104,30 @@ def _validate_targets(y, class_weight, dtype): y_ = _column_or_1d(y, warn=True) _check_classification_targets(y) classes, y = np.unique(y_, return_inverse=True) - class_weight_res = _compute_class_weight(class_weight, - classes=classes, y=y_) + class_weight_res = _compute_class_weight(class_weight, classes=classes, y=y_) if len(classes) < 2: raise ValueError( "The number of classes has to be greater than one; got %d" - " class" % len(classes)) - - return np.asarray(y, dtype=dtype, order='C'), class_weight_res, classes - - -def _check_array(array, dtype="numeric", accept_sparse=False, order=None, - copy=False, force_all_finite=True, - ensure_2d=True, accept_large_sparse=True): + " class" % len(classes) + ) + + return np.asarray(y, dtype=dtype, order="C"), class_weight_res, classes + + +def _check_array( + array, + dtype="numeric", + accept_sparse=False, + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + accept_large_sparse=True, +): if force_all_finite: if sp.issparse(array): - if hasattr(array, 'data'): + if hasattr(array, "data"): _assert_all_finite(array.data) force_all_finite = False else: @@ -129,7 +141,8 @@ def _check_array(array, dtype="numeric", accept_sparse=False, order=None, copy=copy, force_all_finite=force_all_finite, ensure_2d=ensure_2d, - accept_large_sparse=accept_large_sparse) + accept_large_sparse=accept_large_sparse, + ) if sp.isspmatrix(array): return array @@ -146,68 +159,82 @@ def _check_array(array, dtype="numeric", accept_sparse=False, order=None, def _check_X_y( - X, - y, - dtype="numeric", - accept_sparse=False, - order=None, - copy=False, - force_all_finite=True, - ensure_2d=True, - accept_large_sparse=True, - y_numeric=False, - accept_2d_y=False): + X, + y, + dtype="numeric", + accept_sparse=False, + order=None, + copy=False, + force_all_finite=True, + ensure_2d=True, + accept_large_sparse=True, + y_numeric=False, + accept_2d_y=False, +): if y is None: raise ValueError("y cannot be None") - X = _check_array(X, 
accept_sparse=accept_sparse, - dtype=dtype, order=order, copy=copy, - force_all_finite=force_all_finite, - ensure_2d=ensure_2d, - accept_large_sparse=accept_large_sparse) + X = _check_array( + X, + accept_sparse=accept_sparse, + dtype=dtype, + order=order, + copy=copy, + force_all_finite=force_all_finite, + ensure_2d=ensure_2d, + accept_large_sparse=accept_large_sparse, + ) if not accept_2d_y: y = _column_or_1d(y, warn=True) - if y_numeric and y.dtype.kind == 'O': + if y_numeric and y.dtype.kind == "O": y = y.astype(np.float64) _assert_all_finite(y) lengths = [X.shape[0], y.shape[0]] uniques = np.unique(lengths) if len(uniques) > 1: - raise ValueError("Found input variables with inconsistent numbers of" - " samples: %r" % [int(length) for length in lengths]) + raise ValueError( + "Found input variables with inconsistent numbers of" + " samples: %r" % [int(length) for length in lengths] + ) return X, y def _check_classification_targets(y): y_type = _type_of_target(y) - if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', - 'multilabel-indicator', 'multilabel-sequences']: + if y_type not in [ + "binary", + "multiclass", + "multiclass-multioutput", + "multilabel-indicator", + "multilabel-sequences", + ]: raise ValueError("Unknown label type: %r" % y_type) def _type_of_target(y): - is_sequence, is_array = isinstance(y, Sequence), hasattr(y, '__array__') + is_sequence, is_array = isinstance(y, Sequence), hasattr(y, "__array__") is_not_string, is_spmatrix = not isinstance(y, str), sp.isspmatrix(y) valid = (is_sequence or is_array or is_spmatrix) and is_not_string if not valid: - raise ValueError('Expected array-like (array or non-string sequence), ' - 'got %r' % y) + raise ValueError( + "Expected array-like (array or non-string sequence), " "got %r" % y + ) - sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) + sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"] if sparse_pandas: raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") if _is_multilabel(y): - return 'multilabel-indicator' + return "multilabel-indicator" # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter('error', np.VisibleDeprecationWarning) + warnings.simplefilter("error", np.VisibleDeprecationWarning) try: y = np.asarray(y) except np.VisibleDeprecationWarning: @@ -217,23 +244,27 @@ def _type_of_target(y): # The old sequence of sequences format try: - if not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) \ - and not isinstance(y[0], str): - raise ValueError('You appear to be using a legacy multi-label data' - ' representation. Sequence of sequences are no' - ' longer supported; use a binary array or sparse' - ' matrix instead - the MultiLabelBinarizer' - ' transformer can convert to this format.') + if ( + not hasattr(y[0], "__array__") + and isinstance(y[0], Sequence) + and not isinstance(y[0], str) + ): + raise ValueError( + "You appear to be using a legacy multi-label data" + " representation. Sequence of sequences are no" + " longer supported; use a binary array or sparse" + " matrix instead - the MultiLabelBinarizer" + " transformer can convert to this format." 
+ ) except IndexError: pass # Invalid inputs - if y.ndim > 2 or (y.dtype == object and len( - y) and not isinstance(y.flat[0], str)): - return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], str)): + return "unknown" # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: - return 'unknown' # [[]] + return "unknown" # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] @@ -241,26 +272,26 @@ def _type_of_target(y): suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values - if y.dtype.kind == 'f' and np.any(y != y.astype(int)): + if y.dtype.kind == "f" and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] _assert_all_finite(y) - return 'continuous' + suffix + return "continuous" + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): - return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] - return 'binary' # [1, 2] or [["a"], ["b"]] + return "multiclass" + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + return "binary" # [1, 2] or [["a"], ["b"]] def _is_integral_float(y): - return y.dtype.kind == 'f' and np.all(y.astype(int) == y) + return y.dtype.kind == "f" and np.all(y.astype(int) == y) def _is_multilabel(y): - if hasattr(y, '__array__') or isinstance(y, Sequence): + if hasattr(y, "__array__") or isinstance(y, Sequence): # DeprecationWarning will be replaced by ValueError, see NEP 34 # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html with warnings.catch_warnings(): - warnings.simplefilter('error', np.VisibleDeprecationWarning) + warnings.simplefilter("error", np.VisibleDeprecationWarning) try: y = np.asarray(y) except np.VisibleDeprecationWarning: @@ -274,12 +305,14 @@ def _is_multilabel(y): if issparse(y): if isinstance(y, (dok_matrix, lil_matrix)): y = y.tocsr() - return len(y.data) == 0 or np.unique(y.data).size == 1 and \ - (y.dtype.kind in 'biu' or _is_integral_float(np.unique(y.data))) + return ( + len(y.data) == 0 + or np.unique(y.data).size == 1 + and (y.dtype.kind in "biu" or _is_integral_float(np.unique(y.data))) + ) labels = np.unique(y) - return len(labels) < 3 and ( - y.dtype.kind in 'biu' or _is_integral_float(labels)) + return len(labels) < 3 and (y.dtype.kind in "biu" or _is_integral_float(labels)) def _check_n_features(self, X, reset): @@ -309,7 +342,8 @@ def _check_n_features(self, X, reset): if n_features != self.n_features_in_: raise ValueError( f"X has {n_features} features, but {self.__class__.__name__} " - f"is expecting {self.n_features_in_} features as input.") + f"is expecting {self.n_features_in_} features as input." + ) def _num_features(X, fallback_1d=False): @@ -318,20 +352,17 @@ def _num_features(X, fallback_1d=False): type_name = type_.__qualname__ else: type_name = f"{type_.__module__}.{type_.__qualname__}" - message = ( - "Unable to find the number of features from X of type " - f"{type_name}" - ) - if not hasattr(X, '__len__') and not hasattr(X, 'shape'): - if not hasattr(X, '__array__'): + message = "Unable to find the number of features from X of type " f"{type_name}" + if not hasattr(X, "__len__") and not hasattr(X, "shape"): + if not hasattr(X, "__array__"): raise TypeError(message) # Only convert X to a numpy array if there is no cheaper, heuristic # option. 
X = np.asarray(X) - if hasattr(X, 'shape'): + if hasattr(X, "shape"): ndim_thr = 1 if fallback_1d else 2 - if not hasattr(X.shape, '__len__') or len(X.shape) < ndim_thr: + if not hasattr(X.shape, "__len__") or len(X.shape) < ndim_thr: message += f" with shape {X.shape}" raise TypeError(message) return X.shape[-1] @@ -340,15 +371,14 @@ def _num_features(X, fallback_1d=False): # Do not consider an array-like of strings or dicts to be a 2D array if isinstance(first_sample, (str, bytes, dict)): - message += (f" where the samples are of type " - f"{type(first_sample).__qualname__}") + message += f" where the samples are of type " f"{type(first_sample).__qualname__}" raise TypeError(message) try: # If X is a list of lists, for instance, we assume that all nested # lists have the same length without checking or converting to # a numpy array to keep this function call as cheap as possible. - if (not fallback_1d) or hasattr(first_sample, '__len__'): + if (not fallback_1d) or hasattr(first_sample, "__len__"): return len(first_sample) else: return 1 @@ -371,8 +401,8 @@ def _num_samples(x): if hasattr(x, "shape") and x.shape is not None: if len(x.shape) == 0: raise TypeError( - "Singleton array %r cannot be considered a valid collection." % - x) + "Singleton array %r cannot be considered a valid collection." % x + ) # Check that shape is returning an integer or default to len # Dask dataframes may not return numeric shape[0] value if hasattr(x, "shape") and isinstance(x.shape[0], Integral): diff --git a/sklearnex/__main__.py b/sklearnex/__main__.py index f64f90402f..8fc1bbdaeb 100755 --- a/sklearnex/__main__.py +++ b/sklearnex/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sys + from sklearnex import patch_sklearn @@ -29,27 +30,30 @@ def _main(): scikit-learn, optimizing solvers of scikit-learn with Intel(R) oneAPI Data Analytics Library. """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument('-m', action='store_true', dest='module', - help="Executes following as a module") - parser.add_argument('name', help="Script or module name") - parser.add_argument('args', nargs=argparse.REMAINDER, - help="Command line arguments") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "-m", action="store_true", dest="module", help="Executes following as a module" + ) + parser.add_argument("name", help="Script or module name") + parser.add_argument("args", nargs=argparse.REMAINDER, help="Command line arguments") args = parser.parse_args() try: import sklearn + patch_sklearn() except ImportError: print("Scikit-learn could not be imported. 
Nothing to patch") sys.argv = [args.name] + args.args - if '_' + args.name in globals(): - return globals()['_' + args.name](*args.args) + if "_" + args.name in globals(): + return globals()["_" + args.name](*args.args) import runpy + runf = runpy.run_module if args.module else runpy.run_path - runf(args.name, run_name='__main__') + runf(args.name, run_name="__main__") sys.exit(_main()) diff --git a/sklearnex/_config.py b/sklearnex/_config.py index 6bba89145a..fa85762589 100644 --- a/sklearnex/_config.py +++ b/sklearnex/_config.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import threading from contextlib import contextmanager diff --git a/sklearnex/_device_offload.py b/sklearnex/_device_offload.py index 2d1085f225..f0177e4cd3 100644 --- a/sklearnex/_device_offload.py +++ b/sklearnex/_device_offload.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,44 +12,49 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from ._config import get_config -from ._utils import get_patch_message +import logging +import sys from functools import wraps + import numpy as np -import sys -import logging + +from ._config import get_config +from ._utils import get_patch_message try: from dpctl import SyclQueue from dpctl.memory import MemoryUSMDevice, as_usm_memory from dpctl.tensor import usm_ndarray + dpctl_available = True except ImportError: dpctl_available = False -oneapi_is_available = 'daal4py.oneapi' in sys.modules +oneapi_is_available = "daal4py.oneapi" in sys.modules if oneapi_is_available: from daal4py.oneapi import _get_device_name_sycl_ctxt, _get_sycl_ctxt_params class DummySyclQueue: - '''This class is designed to act like dpctl.SyclQueue - to allow device dispatching in scenarios when dpctl is not available''' + """This class is designed to act like dpctl.SyclQueue + to allow device dispatching in scenarios when dpctl is not available""" class DummySyclDevice: def __init__(self, filter_string): self._filter_string = filter_string - self.is_cpu = 'cpu' in filter_string - self.is_gpu = 'gpu' in filter_string + self.is_cpu = "cpu" in filter_string + self.is_gpu = "gpu" in filter_string # TODO: check for possibility of fp64 support # on other devices in this dummy class self.has_aspect_fp64 = self.is_cpu if not (self.is_cpu): - logging.warning("Device support is limited. " - "Please install dpctl for full experience") + logging.warning( + "Device support is limited. 
" + "Please install dpctl for full experience" + ) def get_filter_string(self): return self._filter_string @@ -65,23 +70,26 @@ def _get_device_info_from_daal4py(): def _get_global_queue(): - target = get_config()['target_offload'] + target = get_config()["target_offload"] d4p_target, _ = _get_device_info_from_daal4py() - if d4p_target == 'host': - d4p_target = 'cpu' + if d4p_target == "host": + d4p_target = "cpu" QueueClass = DummySyclQueue if not dpctl_available else SyclQueue - if target != 'auto': - if d4p_target is not None and \ - d4p_target != target: + if target != "auto": + if d4p_target is not None and d4p_target != target: if not isinstance(target, str): if d4p_target not in target.sycl_device.get_filter_string(): - raise RuntimeError("Cannot use target offload option " - "inside daal4py.oneapi.sycl_context") + raise RuntimeError( + "Cannot use target offload option " + "inside daal4py.oneapi.sycl_context" + ) else: - raise RuntimeError("Cannot use target offload option " - "inside daal4py.oneapi.sycl_context") + raise RuntimeError( + "Cannot use target offload option " + "inside daal4py.oneapi.sycl_context" + ) if isinstance(target, QueueClass): return target return QueueClass(target) @@ -95,22 +103,25 @@ def _transfer_to_host(queue, *data): host_data = [] for item in data: - usm_iface = getattr(item, '__sycl_usm_array_interface__', None) + usm_iface = getattr(item, "__sycl_usm_array_interface__", None) if usm_iface is not None: if not dpctl_available: - raise RuntimeError("dpctl need to be installed to work " - "with __sycl_usm_array_interface__") + raise RuntimeError( + "dpctl need to be installed to work " + "with __sycl_usm_array_interface__" + ) if queue is not None: - if queue.sycl_device != usm_iface['syclobj'].sycl_device: - raise RuntimeError('Input data shall be located ' - 'on single target device') + if queue.sycl_device != usm_iface["syclobj"].sycl_device: + raise RuntimeError( + "Input data shall be located " "on single target device" + ) else: - queue = usm_iface['syclobj'] + queue = usm_iface["syclobj"] buffer = as_usm_memory(item).copy_to_host() - item = np.ndarray(shape=usm_iface['shape'], - dtype=usm_iface['typestr'], - buffer=buffer) + item = np.ndarray( + shape=usm_iface["shape"], dtype=usm_iface["typestr"], buffer=buffer + ) has_usm_data = True else: has_host_data = True @@ -119,7 +130,7 @@ def _transfer_to_host(queue, *data): mismatch_usm_item = usm_iface is not None and has_host_data if mismatch_host_item or mismatch_usm_item: - raise RuntimeError('Input data shall be located on single target device') + raise RuntimeError("Input data shall be located on single target device") host_data.append(item) return queue, host_data @@ -129,20 +140,22 @@ def _get_backend(obj, queue, method_name, *data): cpu_device = queue is None or queue.sycl_device.is_cpu gpu_device = queue is not None and queue.sycl_device.is_gpu - if (cpu_device and obj._onedal_cpu_supported(method_name, *data)) or \ - (gpu_device and obj._onedal_gpu_supported(method_name, *data)): - return 'onedal', queue + if (cpu_device and obj._onedal_cpu_supported(method_name, *data)) or ( + gpu_device and obj._onedal_gpu_supported(method_name, *data) + ): + return "onedal", queue if cpu_device: - return 'sklearn', None + return "sklearn", None _, d4p_options = _get_device_info_from_daal4py() - allow_fallback_to_host = get_config()['allow_fallback_to_host'] or \ - d4p_options.get('host_offload_on_fail', False) + allow_fallback_to_host = get_config()["allow_fallback_to_host"] or d4p_options.get( + 
"host_offload_on_fail", False + ) if gpu_device and allow_fallback_to_host: if obj._onedal_cpu_supported(method_name, *data): - return 'onedal', None - return 'sklearn', None + return "onedal", None + return "sklearn", None raise RuntimeError("Device support is not implemented") @@ -155,18 +168,20 @@ def dispatch(obj, method_name, branches, *args, **kwargs): backend, q = _get_backend(obj, q, method_name, *hostargs) - if backend == 'onedal': + if backend == "onedal": return branches[backend](obj, *hostargs, **hostkwargs, queue=q) - if backend == 'sklearn': + if backend == "sklearn": return branches[backend](obj, *hostargs, **hostkwargs) - raise RuntimeError(f'Undefined backend {backend} in ' - f'{obj.__class__.__name__}.{method_name}') + raise RuntimeError( + f"Undefined backend {backend} in " f"{obj.__class__.__name__}.{method_name}" + ) def _copy_to_usm(queue, array): if not dpctl_available: - raise RuntimeError("dpctl need to be installed to work " - "with __sycl_usm_array_interface__") + raise RuntimeError( + "dpctl need to be installed to work " "with __sycl_usm_array_interface__" + ) mem = MemoryUSMDevice(array.nbytes, queue=queue) mem.copy_from_host(array.tobytes()) return usm_ndarray(array.shape, array.dtype, buffer=mem) @@ -179,9 +194,10 @@ def wrapper(self, *args, **kwargs): if len(data) == 0: usm_iface = None else: - usm_iface = getattr(data[0], '__sycl_usm_array_interface__', None) + usm_iface = getattr(data[0], "__sycl_usm_array_interface__", None) result = func(self, *args, **kwargs) if usm_iface is not None: - return _copy_to_usm(usm_iface['syclobj'], result) + return _copy_to_usm(usm_iface["syclobj"], result) return result + return wrapper diff --git a/sklearnex/_utils.py b/sklearnex/_utils.py index 7f0d32a41b..dc7dae5365 100755 --- a/sklearnex/_utils.py +++ b/sklearnex/_utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,21 +13,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import logging -import warnings import os import sys +import warnings + from daal4py.sklearn._utils import daal_check_version def set_sklearn_ex_verbose(): log_level = os.environ.get("SKLEARNEX_VERBOSE") - logger = logging.getLogger('sklearnex') + logger = logging.getLogger("sklearnex") logging_channel = logging.StreamHandler() - logging_formatter = logging.Formatter('%(levelname)s:%(name)s: %(message)s') + logging_formatter = logging.Formatter("%(levelname)s:%(name)s: %(message)s") logging_channel.setFormatter(logging_formatter) logger.addHandler(logging_channel) @@ -35,9 +36,11 @@ def set_sklearn_ex_verbose(): if log_level is not None: logger.setLevel(log_level) except Exception: - warnings.warn('Unknown level "{}" for logging.\n' - 'Please, use one of "CRITICAL", "ERROR", ' - '"WARNING", "INFO", "DEBUG".'.format(log_level)) + warnings.warn( + 'Unknown level "{}" for logging.\n' + 'Please, use one of "CRITICAL", "ERROR", ' + '"WARNING", "INFO", "DEBUG".'.format(log_level) + ) def get_patch_message(s, queue=None, cpu_fallback=False): @@ -45,27 +48,29 @@ def get_patch_message(s, queue=None, cpu_fallback=False): message = "running accelerated version on " if queue is not None: if queue.sycl_device.is_gpu: - message += 'GPU' + message += "GPU" elif queue.sycl_device.is_cpu: - message += 'CPU' + message += "CPU" else: - raise RuntimeError('Unsupported device') + raise RuntimeError("Unsupported device") - elif 'daal4py.oneapi' in sys.modules: + elif "daal4py.oneapi" in sys.modules: from daal4py.oneapi import _get_device_name_sycl_ctxt + dev = _get_device_name_sycl_ctxt() - if dev == 'cpu' or dev is None: - message += 'CPU' - elif dev == 'gpu': + if dev == "cpu" or dev is None: + message += "CPU" + elif dev == "gpu": if cpu_fallback: - message += 'CPU' + message += "CPU" else: - message += 'GPU' + message += "GPU" else: - raise ValueError(f"Unexpected device name {dev}." - " Supported types are cpu and gpu") + raise ValueError( + f"Unexpected device name {dev}." " Supported types are cpu and gpu" + ) else: - message += 'CPU' + message += "CPU" elif s == "sklearn": message = "fallback to original Scikit-learn" @@ -74,7 +79,8 @@ def get_patch_message(s, queue=None, cpu_fallback=False): else: raise ValueError( f"Invalid input - expected one of 'onedal','sklearn'," - f" 'sklearn_after_onedal', got {s}") + f" 'sklearn_after_onedal', got {s}" + ) return message diff --git a/sklearnex/basic_statistics/__init__.py b/sklearnex/basic_statistics/__init__.py index 623c7cd83f..43c391c96f 100644 --- a/sklearnex/basic_statistics/__init__.py +++ b/sklearnex/basic_statistics/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
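For context, `set_sklearn_ex_verbose()` above binds the `sklearnex` logger to the `SKLEARNEX_VERBOSE` environment variable, and `get_patch_message()` composes the per-call messages ("running accelerated version on CPU/GPU", "fallback to original Scikit-learn"). A minimal sketch of surfacing those messages (illustrative only; it assumes the variable is read when patching is applied):

    import os

    # Assumption: SKLEARNEX_VERBOSE is read at patching time, so it must be set
    # before sklearnex is imported and patch_sklearn() is called.
    os.environ["SKLEARNEX_VERBOSE"] = "INFO"

    import numpy as np
    from sklearnex import patch_sklearn

    patch_sklearn()
    from sklearn.linear_model import LinearRegression

    X = np.arange(20, dtype=np.float64).reshape(10, 2)
    y = X @ np.array([1.0, 2.0]) + 3.0

    # Each patched call now logs whether it ran the accelerated version
    # or fell back to the original scikit-learn implementation.
    LinearRegression().fit(X, y)
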
-#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py index 09e298c81b..f2b5b41694 100644 --- a/sklearnex/basic_statistics/basic_statistics.py +++ b/sklearnex/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,6 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.basic_statistics import BasicStatistics diff --git a/sklearnex/cluster/__init__.py b/sklearnex/cluster/__init__.py index 3376349de3..81a8d7046d 100755 --- a/sklearnex/cluster/__init__.py +++ b/sklearnex/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,9 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from .k_means import KMeans from .dbscan import DBSCAN +from .k_means import KMeans -__all__ = ['KMeans', 'DBSCAN'] +__all__ = ["KMeans", "DBSCAN"] diff --git a/sklearnex/cluster/dbscan.py b/sklearnex/cluster/dbscan.py index 31fea742f3..7e2dc8d1a7 100755 --- a/sklearnex/cluster/dbscan.py +++ b/sklearnex/cluster/dbscan.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn.cluster import DBSCAN diff --git a/sklearnex/cluster/k_means.py b/sklearnex/cluster/k_means.py index 1f5b4556fc..50f70418d5 100755 --- a/sklearnex/cluster/k_means.py +++ b/sklearnex/cluster/k_means.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.cluster import KMeans diff --git a/sklearnex/cluster/tests/test_dbscan.py b/sklearnex/cluster/tests/test_dbscan.py index 5690c8427c..c1e5e7830f 100755 --- a/sklearnex/cluster/tests/test_dbscan.py +++ b/sklearnex/cluster/tests/test_dbscan.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,10 @@ def test_sklearnex_import(): from sklearnex.cluster import DBSCAN - X = np.array([[1, 2], [2, 2], [2, 3], - [8, 7], [8, 8], [25, 80]]) + + X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]) dbscan = DBSCAN(eps=3, min_samples=2).fit(X) - assert 'daal4py' in dbscan.__module__ + assert "daal4py" in dbscan.__module__ result = dbscan.labels_ expected = np.array([0, 0, 0, 1, 1, -1], dtype=np.int32) diff --git a/sklearnex/cluster/tests/test_kmeans.py b/sklearnex/cluster/tests/test_kmeans.py index 78570349cf..69a8787bf4 100755 --- a/sklearnex/cluster/tests/test_kmeans.py +++ b/sklearnex/cluster/tests/test_kmeans.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,10 @@ def test_sklearnex_import(): from sklearnex.cluster import KMeans - X = np.array([[1, 2], [1, 4], [1, 0], - [10, 2], [10, 4], [10, 0]]) + + X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) - assert 'daal4py' in kmeans.__module__ + assert "daal4py" in kmeans.__module__ result = kmeans.predict([[0, 0], [12, 3]]) expected = np.array([1, 0], dtype=np.int32) diff --git a/sklearnex/decomposition/__init__.py b/sklearnex/decomposition/__init__.py index ba84d03dc8..b9dadc237b 100755 --- a/sklearnex/decomposition/__init__.py +++ b/sklearnex/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/decomposition/pca.py b/sklearnex/decomposition/pca.py index 317ac5bc7d..b0f374787d 100755 --- a/sklearnex/decomposition/pca.py +++ b/sklearnex/decomposition/pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.decomposition import PCA diff --git a/sklearnex/decomposition/tests/test_pca.py b/sklearnex/decomposition/tests/test_pca.py index 35c0e686d4..da9d3bc283 100755 --- a/sklearnex/decomposition/tests/test_pca.py +++ b/sklearnex/decomposition/tests/test_pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,7 +21,8 @@ def test_sklearnex_import(): from sklearnex.decomposition import PCA + X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - pca = PCA(n_components=2, svd_solver='full').fit(X) - assert 'daal4py' in pca.__module__ + pca = PCA(n_components=2, svd_solver="full").fit(X) + assert "daal4py" in pca.__module__ assert_allclose(pca.singular_values_, [6.30061232, 0.54980396]) diff --git a/sklearnex/dispatcher.py b/sklearnex/dispatcher.py index 8ef12576ea..d19a4d4786 100644 --- a/sklearnex/dispatcher.py +++ b/sklearnex/dispatcher.py @@ -56,33 +56,49 @@ def get_patch_map(): from .neighbors import KNeighborsRegressor as KNeighborsRegressor_sklearnex from .neighbors import LocalOutlierFactor as LocalOutlierFactor_sklearnex from .neighbors import NearestNeighbors as NearestNeighbors_sklearnex - from .svm import SVC as SVC_sklearnex - from .svm import SVR as SVR_sklearnex - from .svm import NuSVC as NuSVC_sklearnex - from .svm import NuSVR as NuSVR_sklearnex # Preview classes for patching from .preview.cluster import KMeans as KMeans_sklearnex from .preview.decomposition import PCA as PCA_sklearnex - from .preview.linear_model import LinearRegression as LinearRegression_sklearnex from .preview.ensemble import ( ExtraTreesClassifier as ExtraTreesClassifier_sklearnex, - ExtraTreesRegressor as ExtraTreesRegressor_sklearnex, + ) + from .preview.ensemble import ExtraTreesRegressor as ExtraTreesRegressor_sklearnex + from .preview.ensemble import ( RandomForestClassifier as RandomForestClassifier_sklearnex, + ) + from .preview.ensemble import ( RandomForestRegressor as RandomForestRegressor_sklearnex, ) + from .preview.linear_model import LinearRegression as LinearRegression_sklearnex + from .svm import SVC as SVC_sklearnex + from .svm import SVR as SVR_sklearnex + from .svm import NuSVC as NuSVC_sklearnex + from .svm import NuSVR as NuSVR_sklearnex # Patch for mapping if _is_preview_enabled(): # Ensemble - mapping["extra_trees_classifier"] = [[(ensemble_module, - "ExtraTreesClassifier", - ExtraTreesClassifier_sklearnex), - None]] - mapping["extra_trees_regressor"] = [[(ensemble_module, - "ExtraTreesRegressor", - ExtraTreesRegressor_sklearnex), - None]] + mapping["extra_trees_classifier"] = [ + [ + ( + ensemble_module, + "ExtraTreesClassifier", + ExtraTreesClassifier_sklearnex, + ), + None, + ] + ] + mapping["extra_trees_regressor"] = [ + [ + ( + ensemble_module, + "ExtraTreesRegressor", + ExtraTreesRegressor_sklearnex, + ), + None, + ] + ] mapping["extratreesclassifier"] = mapping["extra_trees_classifier"] mapping["extratreesregressor"] = mapping["extra_trees_regressor"] mapping.pop("random_forest_classifier") @@ -239,9 +255,7 @@ def patch_sklearn(name=None, verbose=True, global_patch=False, preview=False): algorithm, verbose=False, deprecation=False, get_map=get_patch_map ) else: - patch_sklearn_orig( - name, verbose=False, deprecation=False, get_map=get_patch_map - ) + patch_sklearn_orig(name, verbose=False, deprecation=False, get_map=get_patch_map) if verbose and sys.stderr is not None: sys.stderr.write( @@ -288,9 +302,7 @@ def sklearn_is_patched(name=None, return_map=False): ) return is_patched else: - return sklearn_is_patched_orig( - name, get_map=get_patch_map, return_map=return_map - ) + return sklearn_is_patched_orig(name, get_map=get_patch_map, 
return_map=return_map) def is_patched_instance(instance: object) -> bool: diff --git a/sklearnex/ensemble/__init__.py b/sklearnex/ensemble/__init__.py index e1102d00d9..5c40aa4974 100644 --- a/sklearnex/ensemble/__init__.py +++ b/sklearnex/ensemble/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/sklearnex/ensemble/forest.py b/sklearnex/ensemble/forest.py index 5ad02b46cd..d15e32ce6d 100644 --- a/sklearnex/ensemble/forest.py +++ b/sklearnex/ensemble/forest.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.ensemble import RandomForestClassifier, RandomForestRegressor diff --git a/sklearnex/ensemble/tests/test_forest.py b/sklearnex/ensemble/tests/test_forest.py index 6437496aae..309b3658a1 100644 --- a/sklearnex/ensemble/tests/test_forest.py +++ b/sklearnex/ensemble/tests/test_forest.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,34 +13,42 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
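For readers unfamiliar with the dispatcher API reformatted above, a short usage sketch of `patch_sklearn()`, `sklearn_is_patched()` and `unpatch_sklearn()` (illustrative only, not introduced by this patch; it assumes all three are re-exported at the `sklearnex` top level):

    from sklearnex import patch_sklearn, sklearn_is_patched, unpatch_sklearn

    # Patch the whole map, or a subset by name, e.g. patch_sklearn(["svc"]).
    patch_sklearn()

    from sklearn.ensemble import RandomForestClassifier

    print(sklearn_is_patched())               # True once the patch map has been applied
    print(RandomForestClassifier.__module__)  # daal4py/sklearnex module, as the tests below assert

    # Restore the stock scikit-learn classes.
    unpatch_sklearn()
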
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_rf_classifier(): from sklearnex.ensemble import RandomForestClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'daal4py' in rf.__module__ + assert "daal4py" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_rf_regression(): from sklearnex.ensemble import RandomForestRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'daal4py' in rf.__module__ + assert "daal4py" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) - if daal_check_version((2021, 'P', 400)): + if daal_check_version((2021, "P", 400)): # random engine work was changed in sklearnex 2023.1 - assert np.allclose([-6.97], pred, atol=1e-2) \ - or np.allclose([-8.36], pred, atol=1e-2) + assert np.allclose([-6.97], pred, atol=1e-2) or np.allclose( + [-8.36], pred, atol=1e-2 + ) else: assert_allclose([-6.66], pred, atol=1e-2) diff --git a/sklearnex/glob/__main__.py b/sklearnex/glob/__main__.py index 7712a587d6..de51b784e3 100755 --- a/sklearnex/glob/__main__.py +++ b/sklearnex/glob/__main__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from sklearnex import patch_sklearn -from sklearnex import unpatch_sklearn +from sklearnex import patch_sklearn, unpatch_sklearn def _main(): @@ -34,15 +33,29 @@ def __call__(self, parser, namespace, values, option_string=None): description=""" Patch all your Scikit-learn applications using Intel(R) Extension for scikit-learn.""", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.register('action', 'extend', ExtendAction) - parser.add_argument('action', choices=["patch_sklearn", "unpatch_sklearn"], - help="Enable or Disable patching") - parser.add_argument('--no-verbose', '-nv', action='store_false', - help="Disable additional information about enabling patching") - parser.add_argument('--algorithm', '-a', action='extend', type=str, nargs="+", - help="The name of an algorithm to be patched globally") + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.register("action", "extend", ExtendAction) + parser.add_argument( + "action", + choices=["patch_sklearn", "unpatch_sklearn"], + help="Enable or Disable patching", + ) + parser.add_argument( + "--no-verbose", + "-nv", + action="store_false", + help="Disable additional information about enabling patching", + ) + parser.add_argument( + "--algorithm", + "-a", + action="extend", + type=str, + nargs="+", + help="The name of an algorithm to be patched globally", + ) args = parser.parse_args() if args.action == "patch_sklearn": @@ -50,9 +63,11 @@ def __call__(self, parser, namespace, values, option_string=None): elif args.action == "unpatch_sklearn": unpatch_sklearn(global_unpatch=True) else: - raise RuntimeError("Invalid choice for the action attribute." - " Expected: patch_sklearn or unpatch_sklearn." - f" Got {args.action}") + raise RuntimeError( + "Invalid choice for the action attribute." + " Expected: patch_sklearn or unpatch_sklearn." + f" Got {args.action}" + ) _main() diff --git a/sklearnex/glob/dispatcher.py b/sklearnex/glob/dispatcher.py index 7633832921..631e51c907 100755 --- a/sklearnex/glob/dispatcher.py +++ b/sklearnex/glob/dispatcher.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== + def get_patch_str(name=None, verbose=True): return f"""try: @@ -36,46 +37,52 @@ def get_patch_str_re(): def patch_sklearn_global(name=None, verbose=True): import os import re + try: import sklearn except ImportError: raise ImportError("Scikit-learn could not be imported. 
Nothing to patch\n") init_file_path = sklearn.__file__ - distributor_file_path = os.path.join(os.path.dirname(init_file_path), - "_distributor_init.py") + distributor_file_path = os.path.join( + os.path.dirname(init_file_path), "_distributor_init.py" + ) - with open(distributor_file_path, 'r', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "r", encoding="utf-8") as distributor_file: lines = distributor_file.read() if re.search(get_patch_str_re(), lines): - lines = re.sub(get_patch_str_re(), '', lines) + lines = re.sub(get_patch_str_re(), "", lines) - with open(distributor_file_path, 'w', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "w", encoding="utf-8") as distributor_file: distributor_file.write(lines + "\n" + get_patch_str(name, verbose) + "\n") - print("Scikit-learn was successfully globally patched" - " by Intel(R) Extension for Scikit-learn") + print( + "Scikit-learn was successfully globally patched" + " by Intel(R) Extension for Scikit-learn" + ) return def unpatch_sklearn_global(): import os import re + try: import sklearn except ImportError: raise ImportError("Scikit-learn could not be imported. Nothing to unpatch\n") init_file_path = sklearn.__file__ - distributor_file_path = os.path.join(os.path.dirname(init_file_path), - "_distributor_init.py") + distributor_file_path = os.path.join( + os.path.dirname(init_file_path), "_distributor_init.py" + ) - with open(distributor_file_path, 'r', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "r", encoding="utf-8") as distributor_file: lines = distributor_file.read() if not re.search(get_patch_str_re(), lines): print("Nothing to unpatch: Scikit-learn is not patched\n") return - lines = re.sub(get_patch_str_re(), '', lines) + lines = re.sub(get_patch_str_re(), "", lines) - with open(distributor_file_path, 'w', encoding='utf-8') as distributor_file: + with open(distributor_file_path, "w", encoding="utf-8") as distributor_file: distributor_file.write(lines) print("Scikit-learn was successfully globally unpatched") diff --git a/sklearnex/linear_model/__init__.py b/sklearnex/linear_model/__init__.py index d04e9cb3cf..012522ca82 100755 --- a/sklearnex/linear_model/__init__.py +++ b/sklearnex/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
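For context, `patch_sklearn_global()` and `unpatch_sklearn_global()` above persist patching by writing the snippet from `get_patch_str()` into scikit-learn's `_distributor_init.py`, so any later `import sklearn` is patched without code changes. A rough equivalent of the `python -m sklearnex.glob patch_sklearn` command line (illustrative only; it requires write access to the installed scikit-learn package):

    # Command-line form (sklearnex/glob/__main__.py above):
    #   python -m sklearnex.glob patch_sklearn
    #   python -m sklearnex.glob unpatch_sklearn

    from sklearnex.glob.dispatcher import patch_sklearn_global, unpatch_sklearn_global

    # Appends the patching snippet to sklearn/_distributor_init.py.
    patch_sklearn_global()

    # Removes the snippet again, restoring the stock import behaviour.
    unpatch_sklearn_global()
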
-#=============================================================================== +# =============================================================================== +from .coordinate_descent import ElasticNet, Lasso from .linear import LinearRegression -from .logistic_path import logistic_regression_path, LogisticRegression +from .logistic_path import LogisticRegression, logistic_regression_path from .ridge import Ridge -from .coordinate_descent import ElasticNet, Lasso __all__ = [ - 'Ridge', - 'LinearRegression', - 'LogisticRegression', - 'logistic_regression_path', - 'ElasticNet', - 'Lasso' + "Ridge", + "LinearRegression", + "LogisticRegression", + "logistic_regression_path", + "ElasticNet", + "Lasso", ] diff --git a/sklearnex/linear_model/coordinate_descent.py b/sklearnex/linear_model/coordinate_descent.py index efc8e72e20..731de3dc09 100644 --- a/sklearnex/linear_model/coordinate_descent.py +++ b/sklearnex/linear_model/coordinate_descent.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.linear_model import ElasticNet, Lasso diff --git a/sklearnex/linear_model/logistic_path.py b/sklearnex/linear_model/logistic_path.py index ee852748f6..b9274f76f5 100644 --- a/sklearnex/linear_model/logistic_path.py +++ b/sklearnex/linear_model/logistic_path.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn.linear_model import logistic_regression_path, LogisticRegression +from daal4py.sklearn.linear_model import LogisticRegression, logistic_regression_path diff --git a/sklearnex/linear_model/ridge.py b/sklearnex/linear_model/ridge.py index 81a83c5f98..6c00cee3a3 100644 --- a/sklearnex/linear_model/ridge.py +++ b/sklearnex/linear_model/ridge.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn.linear_model import Ridge diff --git a/sklearnex/linear_model/tests/test_linear.py b/sklearnex/linear_model/tests/test_linear.py index 7a1e3f52d8..3b8dd9d3ab 100755 --- a/sklearnex/linear_model/tests/test_linear.py +++ b/sklearnex/linear_model/tests/test_linear.py @@ -18,44 +18,49 @@ import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_linear(): from sklearnex.linear_model import LinearRegression + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 linreg = LinearRegression().fit(X, y) - assert 'daal4py' in linreg.__module__ + assert "daal4py" in linreg.__module__ assert linreg.n_features_in_ == 2 - assert_allclose(linreg.intercept_, 3.) - assert_allclose(linreg.coef_, [1., 2.]) + assert_allclose(linreg.intercept_, 3.0) + assert_allclose(linreg.coef_, [1.0, 2.0]) def test_sklearnex_import_ridge(): from sklearnex.linear_model import Ridge + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 ridgereg = Ridge().fit(X, y) - assert 'daal4py' in ridgereg.__module__ + assert "daal4py" in ridgereg.__module__ assert_allclose(ridgereg.intercept_, 4.5) assert_allclose(ridgereg.coef_, [0.8, 1.4]) def test_sklearnex_import_lasso(): from sklearnex.linear_model import Lasso + X = [[0, 0], [1, 1], [2, 2]] y = [0, 1, 2] lasso = Lasso(alpha=0.1).fit(X, y) - assert 'daal4py' in lasso.__module__ + assert "daal4py" in lasso.__module__ assert_allclose(lasso.intercept_, 0.15) assert_allclose(lasso.coef_, [0.85, 0.0]) def test_sklearnex_import_elastic(): from sklearnex.linear_model import ElasticNet + X, y = make_regression(n_features=2, random_state=0) elasticnet = ElasticNet(random_state=0).fit(X, y) - assert 'daal4py' in elasticnet.__module__ + assert "daal4py" in elasticnet.__module__ assert_allclose(elasticnet.intercept_, 1.451, atol=1e-3) assert_allclose(elasticnet.coef_, [18.838, 64.559], atol=1e-3) diff --git a/sklearnex/linear_model/tests/test_logreg.py b/sklearnex/linear_model/tests/test_logreg.py index 35489b0eff..c361a09a48 100755 --- a/sklearnex/linear_model/tests/test_logreg.py +++ b/sklearnex/linear_model/tests/test_logreg.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -22,7 +22,8 @@ def test_sklearnex_import(): from sklearnex.linear_model import LogisticRegression + X, y = load_iris(return_X_y=True) logreg = LogisticRegression(random_state=0, max_iter=200).fit(X, y) - assert 'daal4py' in logreg.__module__ + assert "daal4py" in logreg.__module__ assert_allclose(logreg.score(X, y), 0.9733, atol=1e-3) diff --git a/sklearnex/manifold/__init__.py b/sklearnex/manifold/__init__.py index 6310727ed0..9c9fda72f6 100755 --- a/sklearnex/manifold/__init__.py +++ b/sklearnex/manifold/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .t_sne import TSNE -__all__ = ['TSNE'] +__all__ = ["TSNE"] diff --git a/sklearnex/manifold/t_sne.py b/sklearnex/manifold/t_sne.py index 000e1406d6..bb1b72f48c 100755 --- a/sklearnex/manifold/t_sne.py +++ b/sklearnex/manifold/t_sne.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.manifold import TSNE diff --git a/sklearnex/manifold/tests/test_tsne.py b/sklearnex/manifold/tests/test_tsne.py index 159cebeba0..a5e5027d97 100755 --- a/sklearnex/manifold/tests/test_tsne.py +++ b/sklearnex/manifold/tests/test_tsne.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,6 +21,7 @@ def test_sklearnex_import(): from sklearnex.manifold import TSNE + X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]]) tsne = TSNE(n_components=2, perplexity=2.0).fit(X) - assert 'daal4py' in tsne.__module__ + assert "daal4py" in tsne.__module__ diff --git a/sklearnex/metrics/__init__.py b/sklearnex/metrics/__init__.py index 1d3a5b8021..37724a8557 100755 --- a/sklearnex/metrics/__init__.py +++ b/sklearnex/metrics/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +13,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from .ranking import roc_auc_score from .pairwise import pairwise_distances +from .ranking import roc_auc_score __all__ = [ - 'roc_auc_score', - 'pairwise_distances', + "roc_auc_score", + "pairwise_distances", ] diff --git a/sklearnex/metrics/pairwise.py b/sklearnex/metrics/pairwise.py index 25f32b5d83..938bad4dd4 100755 --- a/sklearnex/metrics/pairwise.py +++ b/sklearnex/metrics/pairwise.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.metrics import pairwise_distances diff --git a/sklearnex/metrics/ranking.py b/sklearnex/metrics/ranking.py index 8982be4e69..14762dd3ef 100755 --- a/sklearnex/metrics/ranking.py +++ b/sklearnex/metrics/ranking.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from daal4py.sklearn.metrics import roc_auc_score diff --git a/sklearnex/metrics/tests/test_metrics.py b/sklearnex/metrics/tests/test_metrics.py index cf1d6bda93..85ac15cb3a 100755 --- a/sklearnex/metrics/tests/test_metrics.py +++ b/sklearnex/metrics/tests/test_metrics.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,18 +21,20 @@ def test_sklearnex_import_roc_auc(): - from sklearnex.metrics import roc_auc_score from sklearnex.linear_model import LogisticRegression + from sklearnex.metrics import roc_auc_score + X, y = load_breast_cancer(return_X_y=True) - clf = LogisticRegression(solver='liblinear', random_state=0).fit(X, y) + clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y) res = roc_auc_score(y, clf.decision_function(X)) assert_allclose(res, 0.99, atol=1e-2) def test_sklearnex_import_pairwise_distances(): from sklearnex.metrics import pairwise_distances + rng = np.random.RandomState(0) x = np.abs(rng.rand(4), dtype=np.float64) x = np.vstack([x, x]) - res = pairwise_distances(x, metric='cosine') - assert_allclose(res, [[0., 0.], [0., 0.]], atol=1e-2) + res = pairwise_distances(x, metric="cosine") + assert_allclose(res, [[0.0, 0.0], [0.0, 0.0]], atol=1e-2) diff --git a/sklearnex/model_selection/__init__.py b/sklearnex/model_selection/__init__.py index b96bd0f4ab..99222cd7f1 100755 --- a/sklearnex/model_selection/__init__.py +++ b/sklearnex/model_selection/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .split import train_test_split __all__ = [ - 'train_test_split', + "train_test_split", ] diff --git a/sklearnex/model_selection/split.py b/sklearnex/model_selection/split.py index d2278382f2..cd00f112ab 100755 --- a/sklearnex/model_selection/split.py +++ b/sklearnex/model_selection/split.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.model_selection import train_test_split diff --git a/sklearnex/model_selection/tests/test_model_selection.py b/sklearnex/model_selection/tests/test_model_selection.py index 1e12c53461..78af6b12e0 100755 --- a/sklearnex/model_selection/tests/test_model_selection.py +++ b/sklearnex/model_selection/tests/test_model_selection.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,10 +21,11 @@ def test_sklearnex_import_train_test_split(): from sklearnex.model_selection import train_test_split + X = np.arange(100).reshape((10, 10)) y = np.arange(10) - split = train_test_split(X, y, test_size=None, train_size=.5) + split = train_test_split(X, y, test_size=None, train_size=0.5) X_train, X_test, y_train, y_test = split assert len(y_test) == len(y_train) diff --git a/sklearnex/neighbors/__init__.py b/sklearnex/neighbors/__init__.py index c84dcd238e..1f9d31c88e 100755 --- a/sklearnex/neighbors/__init__.py +++ b/sklearnex/neighbors/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,12 +13,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .knn_classification import KNeighborsClassifier -from .knn_unsupervised import NearestNeighbors from .knn_regression import KNeighborsRegressor +from .knn_unsupervised import NearestNeighbors from .lof import LocalOutlierFactor -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'LocalOutlierFactor', - 'NearestNeighbors'] +__all__ = [ + "KNeighborsClassifier", + "KNeighborsRegressor", + "LocalOutlierFactor", + "NearestNeighbors", +] diff --git a/sklearnex/neighbors/common.py b/sklearnex/neighbors/common.py index c3d5aad234..e12056d56f 100644 --- a/sklearnex/neighbors/common.py +++ b/sklearnex/neighbors/common.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,20 +13,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version -from onedal.utils import _check_array, _num_features, _num_samples +import warnings import numpy as np from scipy import sparse as sp -import warnings - +from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._base import VALID_METRICS from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase -from sklearn.neighbors._ball_tree import BallTree from sklearn.neighbors._kd_tree import KDTree +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version +from onedal.utils import _check_array, _num_features, _num_samples + class KNeighborsDispatchingBase: def _fit_validation(self, X, y=None): @@ -34,11 +34,15 @@ def _fit_validation(self, X, y=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - if self.metric_params is not None and 'p' in self.metric_params: + if self.metric_params is not None and "p" in self.metric_params: if self.p is not None: - warnings.warn("Parameter p is found in metric_params. " - "The corresponding parameter from __init__ " - "is ignored.", SyntaxWarning, stacklevel=2) + warnings.warn( + "Parameter p is found in metric_params. 
" + "The corresponding parameter from __init__ " + "is ignored.", + SyntaxWarning, + stacklevel=2, + ) self.effective_metric_params_ = self.metric_params.copy() effective_p = self.metric_params["p"] else: @@ -59,31 +63,35 @@ def _fit_validation(self, X, y=None): if not isinstance(X, (KDTree, BallTree, sklearn_NeighborsBase)): self._fit_X = _check_array( - X, dtype=[np.float64, np.float32], accept_sparse=True) + X, dtype=[np.float64, np.float32], accept_sparse=True + ) self.n_samples_fit_ = _num_samples(self._fit_X) self.n_features_in_ = _num_features(self._fit_X) if self.algorithm == "auto": # A tree approach is better for small number of neighbors or small # number of features, with KDTree generally faster when available - is_n_neighbors_valid_for_brute = self.n_neighbors is not None and \ - self.n_neighbors >= self._fit_X.shape[0] // 2 + is_n_neighbors_valid_for_brute = ( + self.n_neighbors is not None + and self.n_neighbors >= self._fit_X.shape[0] // 2 + ) if self._fit_X.shape[1] > 15 or is_n_neighbors_valid_for_brute: self._fit_method = "brute" else: if self.effective_metric_ in VALID_METRICS["kd_tree"]: self._fit_method = "kd_tree" - elif callable(self.effective_metric_) or \ - self.effective_metric_ in \ - VALID_METRICS["ball_tree"]: + elif ( + callable(self.effective_metric_) + or self.effective_metric_ in VALID_METRICS["ball_tree"] + ): self._fit_method = "ball_tree" else: self._fit_method = "brute" else: self._fit_method = self.algorithm - if hasattr(self, '_onedal_estimator'): - delattr(self, '_onedal_estimator') + if hasattr(self, "_onedal_estimator"): + delattr(self, "_onedal_estimator") # To cover test case when we pass patched # estimator as an input for other estimator if isinstance(X, sklearn_NeighborsBase): @@ -92,8 +100,8 @@ def _fit_validation(self, X, y=None): self._fit_method = X._fit_method self.n_samples_fit_ = X.n_samples_fit_ self.n_features_in_ = X.n_features_in_ - if hasattr(X, '_onedal_estimator'): - self.effective_metric_params_.pop('p') + if hasattr(X, "_onedal_estimator"): + self.effective_metric_params_.pop("p") if self._fit_method == "ball_tree": X._tree = BallTree( X._fit_X, @@ -116,58 +124,63 @@ def _fit_validation(self, X, y=None): elif isinstance(X, BallTree): self._fit_X = X.data self._tree = X - self._fit_method = 'ball_tree' + self._fit_method = "ball_tree" self.n_samples_fit_ = X.data.shape[0] self.n_features_in_ = X.data.shape[1] elif isinstance(X, KDTree): self._fit_X = X.data self._tree = X - self._fit_method = 'kd_tree' + self._fit_method = "kd_tree" self.n_samples_fit_ = X.data.shape[0] self.n_features_in_ = X.data.shape[1] def _onedal_supported(self, device, method_name, *data): class_name = self.__class__.__name__ - is_classifier = 'Classifier' in class_name - is_regressor = 'Regressor' in class_name + is_classifier = "Classifier" in class_name + is_regressor = "Regressor" in class_name is_unsupervised = not (is_classifier or is_regressor) patching_status = PatchingConditionsChain( - f'sklearn.neighbors.{class_name}.{method_name}') + f"sklearn.neighbors.{class_name}.{method_name}" + ) if not patching_status.and_condition( not isinstance(data[0], (KDTree, BallTree, sklearn_NeighborsBase)), - f'Input type {type(data[0])} is not supported.' 
+ f"Input type {type(data[0])} is not supported.", ): return patching_status.get_status(logs=True) - if self._fit_method in ['auto', 'ball_tree']: - condition = self.n_neighbors is not None and \ - self.n_neighbors >= self.n_samples_fit_ // 2 + if self._fit_method in ["auto", "ball_tree"]: + condition = ( + self.n_neighbors is not None + and self.n_neighbors >= self.n_samples_fit_ // 2 + ) if self.n_features_in_ > 15 or condition: - result_method = 'brute' + result_method = "brute" else: - if self.effective_metric_ in ['euclidean']: - result_method = 'kd_tree' + if self.effective_metric_ in ["euclidean"]: + result_method = "kd_tree" else: - result_method = 'brute' + result_method = "brute" else: result_method = self._fit_method - p_less_than_one = "p" in self.effective_metric_params_.keys() and \ - self.effective_metric_params_["p"] < 1 + p_less_than_one = ( + "p" in self.effective_metric_params_.keys() + and self.effective_metric_params_["p"] < 1 + ) if not patching_status.and_condition( not p_less_than_one, '"p" metric parameter is less than 1' ): return patching_status.get_status(logs=True) if not patching_status.and_condition( - not sp.isspmatrix(data[0]), 'Sparse input is not supported.' + not sp.isspmatrix(data[0]), "Sparse input is not supported." ): return patching_status.get_status(logs=True) if not is_unsupervised: - is_valid_weights = self.weights in ['uniform', "distance"] + is_valid_weights = self.weights in ["uniform", "distance"] if is_classifier: class_count = 1 is_single_output = False @@ -177,65 +190,73 @@ def _onedal_supported(self, device, method_name, *data): y = np.asarray(data[1]) if is_classifier: class_count = len(np.unique(y)) - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): y = self._onedal_estimator._y - if y is not None and hasattr(y, 'ndim') and hasattr(y, 'shape'): + if y is not None and hasattr(y, "ndim") and hasattr(y, "shape"): is_single_output = y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1 # TODO: add native support for these metric names - metrics_map = { - 'manhattan': ['l1', 'cityblock'], - 'euclidean': ['l2'] - } + metrics_map = {"manhattan": ["l1", "cityblock"], "euclidean": ["l2"]} for origin, aliases in metrics_map.items(): if self.effective_metric_ in aliases: self.effective_metric_ = origin break - if self.effective_metric_ == 'manhattan': - self.effective_metric_params_['p'] = 1 - elif self.effective_metric_ == 'euclidean': - self.effective_metric_params_['p'] = 2 + if self.effective_metric_ == "manhattan": + self.effective_metric_params_["p"] = 1 + elif self.effective_metric_ == "euclidean": + self.effective_metric_params_["p"] = 2 onedal_brute_metrics = [ - 'manhattan', 'minkowski', 'euclidean', 'chebyshev', 'cosine'] - onedal_kdtree_metrics = ['euclidean'] - is_valid_for_brute = result_method == 'brute' and \ - self.effective_metric_ in onedal_brute_metrics - is_valid_for_kd_tree = result_method == 'kd_tree' and \ - self.effective_metric_ in onedal_kdtree_metrics - if result_method == 'kd_tree': + "manhattan", + "minkowski", + "euclidean", + "chebyshev", + "cosine", + ] + onedal_kdtree_metrics = ["euclidean"] + is_valid_for_brute = ( + result_method == "brute" and self.effective_metric_ in onedal_brute_metrics + ) + is_valid_for_kd_tree = ( + result_method == "kd_tree" and self.effective_metric_ in onedal_kdtree_metrics + ) + if result_method == "kd_tree": if not patching_status.and_condition( - device != 'gpu', '"kd_tree" method is not supported on GPU.' 
+ device != "gpu", '"kd_tree" method is not supported on GPU.' ): return patching_status.get_status(logs=True) if not patching_status.and_condition( is_valid_for_kd_tree or is_valid_for_brute, - f'{result_method} with {self.effective_metric_} metric is not supported.' + f"{result_method} with {self.effective_metric_} metric is not supported.", ): return patching_status.get_status(logs=True) if not is_unsupervised: - if not patching_status.and_conditions([ - (is_single_output, 'Only single output is supported.'), - (is_valid_weights, - f'"{type(self.weights)}" weights type is not supported.') - ]): + if not patching_status.and_conditions( + [ + (is_single_output, "Only single output is supported."), + ( + is_valid_weights, + f'"{type(self.weights)}" weights type is not supported.', + ), + ] + ): return patching_status.get_status(logs=True) - if method_name == 'fit': + if method_name == "fit": if is_classifier: patching_status.and_condition( - class_count >= 2, 'One-class case is not supported.' + class_count >= 2, "One-class case is not supported." ) return patching_status.get_status(logs=True) - if method_name in ['predict', 'predict_proba', 'kneighbors']: + if method_name in ["predict", "predict_proba", "kneighbors"]: patching_status.and_condition( - hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.' + hasattr(self, "_onedal_estimator"), "oneDAL model was not trained." ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") def _onedal_gpu_supported(self, method_name, *data): - return self._onedal_supported('gpu', method_name, *data) + return self._onedal_supported("gpu", method_name, *data) def _onedal_cpu_supported(self, method_name, *data): - return self._onedal_supported('cpu', method_name, *data) + return self._onedal_supported("cpu", method_name, *data) diff --git a/sklearnex/neighbors/knn_classification.py b/sklearnex/neighbors/knn_classification.py index 3762cded9e..423345ed1e 100755 --- a/sklearnex/neighbors/knn_classification.py +++ b/sklearnex/neighbors/knn_classification.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,125 +13,203 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._kd_tree import KDTree -if not sklearn_check_version('1.2'): + +from daal4py.sklearn._utils import sklearn_check_version + +if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights + +import numpy as np from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._classification import KNeighborsClassifier as \ - sklearn_KNeighborsClassifier -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors +from sklearn.neighbors._classification import ( + KNeighborsClassifier as sklearn_KNeighborsClassifier, +) +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.utils import _check_array, _num_features, _num_samples from onedal.neighbors import KNeighborsClassifier as onedal_KNeighborsClassifier +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np - +from .common import KNeighborsDispatchingBase if sklearn_check_version("0.24"): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_KNeighborsClassifier._parameter_constraints} + **sklearn_KNeighborsClassifier._parameter_constraints + } @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ - weights if sklearn_check_version("1.0") \ - else _check_weights(weights) + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( + weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors._base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(sklearn_KNeighborsClassifier, - BaseSupervisedIntegerMixin): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier, BaseSupervisedIntegerMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( 
n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedIntegerMixin as \ - BaseSupervisedIntegerMixin + from sklearn.neighbors.base import ( + SupervisedIntegerMixin as BaseSupervisedIntegerMixin, + ) - class KNeighborsClassifier_(sklearn_KNeighborsClassifier, - BaseSupervisedIntegerMixin): + class KNeighborsClassifier_(sklearn_KNeighborsClassifier, BaseSupervisedIntegerMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) class KNeighborsClassifier(KNeighborsClassifier_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KNeighborsClassifier_._parameter_constraints} - - if sklearn_check_version('1.0'): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None): + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KNeighborsClassifier_._parameter_constraints} + + if sklearn_check_version("1.0"): + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs) + n_jobs=n_jobs, + ) + else: + @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) def fit(self, X, y): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KNeighborsClassifier.fit, - }, X, y) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_KNeighborsClassifier.fit, + }, + X, + y, + ) return self @wrap_output_data @@ -139,58 +217,81 @@ def predict(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KNeighborsClassifier.predict, - }, X) + return dispatch( + 
self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KNeighborsClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_KNeighborsClassifier.predict_proba, - }, X) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_KNeighborsClassifier.predict_proba, + }, + X, + ) @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_KNeighborsClassifier.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_KNeighborsClassifier.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'weights': self.weights, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "weights": self.weights, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -212,10 +313,12 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): return self._onedal_estimator.predict_proba(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/knn_regression.py b/sklearnex/neighbors/knn_regression.py index 33bf29a806..efd789f937 100755 --- a/sklearnex/neighbors/knn_regression.py +++ b/sklearnex/neighbors/knn_regression.py @@ -1,5 +1,5 @@ #!/usr/bin/env python 
-#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,125 +13,199 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import sklearn_check_version import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._ball_tree import BallTree +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase from sklearn.neighbors._kd_tree import KDTree -if not sklearn_check_version('1.2'): + +from daal4py.sklearn._utils import sklearn_check_version + +if not sklearn_check_version("1.2"): from sklearn.neighbors._base import _check_weights + +import numpy as np from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._regression import KNeighborsRegressor as \ - sklearn_KNeighborsRegressor -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors +from sklearn.neighbors._regression import ( + KNeighborsRegressor as sklearn_KNeighborsRegressor, +) +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.utils import _check_array, _num_features, _num_samples from onedal.neighbors import KNeighborsRegressor as onedal_KNeighborsRegressor +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np - +from .common import KNeighborsDispatchingBase if sklearn_check_version("0.24"): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_KNeighborsRegressor._parameter_constraints} + **sklearn_KNeighborsRegressor._parameter_constraints + } @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) - self.weights = \ - weights if sklearn_check_version("1.0") \ - else _check_weights(weights) + n_jobs=n_jobs, + **kwargs, + ) + self.weights = ( + weights if sklearn_check_version("1.0") else _check_weights(weights) + ) + elif sklearn_check_version("0.22"): - from sklearn.neighbors._base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors._base import SupervisedFloatMixin as BaseSupervisedFloatMixin - class KNeighborsRegressor_(sklearn_KNeighborsRegressor, - BaseSupervisedFloatMixin): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor, 
BaseSupervisedFloatMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) + else: - from sklearn.neighbors.base import SupervisedFloatMixin as \ - BaseSupervisedFloatMixin + from sklearn.neighbors.base import SupervisedFloatMixin as BaseSupervisedFloatMixin - class KNeighborsRegressor_(sklearn_KNeighborsRegressor, - BaseSupervisedFloatMixin): + class KNeighborsRegressor_(sklearn_KNeighborsRegressor, BaseSupervisedFloatMixin): @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) self.weights = _check_weights(weights) class KNeighborsRegressor(KNeighborsRegressor_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **KNeighborsRegressor_._parameter_constraints} - - if sklearn_check_version('1.0'): - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None): + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**KNeighborsRegressor_._parameter_constraints} + + if sklearn_check_version("1.0"): + + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs) + n_jobs=n_jobs, + ) + else: + @_deprecate_positional_args - def __init__(self, n_neighbors=5, *, - weights='uniform', algorithm='auto', leaf_size=30, - p=2, metric='minkowski', metric_params=None, n_jobs=None, - **kwargs): + def __init__( + self, + n_neighbors=5, + *, + weights="uniform", + algorithm="auto", + leaf_size=30, + p=2, + metric="minkowski", + metric_params=None, + n_jobs=None, + **kwargs, + ): super().__init__( n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, + leaf_size=leaf_size, + metric=metric, + p=p, metric_params=metric_params, - n_jobs=n_jobs, **kwargs) + n_jobs=n_jobs, + **kwargs, + ) def fit(self, X, y): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KNeighborsRegressor.fit, - }, X, y) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + 
"sklearn": sklearn_KNeighborsRegressor.fit, + }, + X, + y, + ) return self @wrap_output_data @@ -139,48 +213,66 @@ def predict(self, X): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KNeighborsRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KNeighborsRegressor.predict, + }, + X, + ) @wrap_output_data def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_KNeighborsRegressor.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_KNeighborsRegressor.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'weights': self.weights, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "weights": self.weights, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -199,10 +291,12 @@ def _onedal_fit(self, X, y, queue=None): def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.n_features_in_ = self._onedal_estimator.n_features_in_ diff --git a/sklearnex/neighbors/knn_unsupervised.py b/sklearnex/neighbors/knn_unsupervised.py index 6670d06ff6..f6c2cf503e 100755 --- a/sklearnex/neighbors/knn_unsupervised.py +++ b/sklearnex/neighbors/knn_unsupervised.py @@ -1,5 +1,5 @@ #!/usr/bin/env python 
-#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,85 +13,128 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== try: from packaging.version import Version except ImportError: from distutils.version import LooseVersion as Version -from sklearn import __version__ as sklearn_version -from daal4py.sklearn._utils import sklearn_check_version + import warnings -from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase +import numpy as np +from sklearn import __version__ as sklearn_version from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors._kd_tree import KDTree from sklearn.neighbors._base import VALID_METRICS -from sklearn.neighbors._unsupervised import NearestNeighbors as \ - sklearn_NearestNeighbors - +from sklearn.neighbors._base import NeighborsBase as sklearn_NeighborsBase +from sklearn.neighbors._kd_tree import KDTree +from sklearn.neighbors._unsupervised import NearestNeighbors as sklearn_NearestNeighbors from sklearn.utils.validation import _deprecate_positional_args, check_is_fitted -from onedal.utils import _check_array, _num_features, _num_samples +from daal4py.sklearn._utils import sklearn_check_version from onedal.neighbors import NearestNeighbors as onedal_NearestNeighbors +from onedal.utils import _check_array, _num_features, _num_samples -from .common import KNeighborsDispatchingBase from .._device_offload import dispatch, wrap_output_data -import numpy as np +from .common import KNeighborsDispatchingBase +if sklearn_check_version("0.22") and Version(sklearn_version) < Version("0.23"): -if sklearn_check_version("0.22") and \ - Version(sklearn_version) < Version("0.23"): class NearestNeighbors_(sklearn_NearestNeighbors): - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) + else: + class NearestNeighbors_(sklearn_NearestNeighbors): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_NearestNeighbors._parameter_constraints} + **sklearn_NearestNeighbors._parameter_constraints + } @_deprecate_positional_args - def __init__(self, *, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + *, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - 
metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) class NearestNeighbors(NearestNeighbors_, KNeighborsDispatchingBase): - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **NearestNeighbors_._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**NearestNeighbors_._parameter_constraints} @_deprecate_positional_args - def __init__(self, n_neighbors=5, radius=1.0, - algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=None): + def __init__( + self, + n_neighbors=5, + radius=1.0, + algorithm="auto", + leaf_size=30, + metric="minkowski", + p=2, + metric_params=None, + n_jobs=None, + ): super().__init__( n_neighbors=n_neighbors, radius=radius, algorithm=algorithm, - leaf_size=leaf_size, metric=metric, p=p, - metric_params=metric_params, n_jobs=n_jobs) + leaf_size=leaf_size, + metric=metric, + p=p, + metric_params=metric_params, + n_jobs=n_jobs, + ) def fit(self, X, y=None): self._fit_validation(X, y) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NearestNeighbors.fit, - }, X, None) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NearestNeighbors.fit, + }, + X, + None, + ) return self @wrap_output_data @@ -99,37 +142,50 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if sklearn_check_version("1.0") and X is not None: self._check_feature_names(X, reset=False) - return dispatch(self, 'kneighbors', { - 'onedal': self.__class__._onedal_kneighbors, - 'sklearn': sklearn_NearestNeighbors.kneighbors, - }, X, n_neighbors, return_distance) + return dispatch( + self, + "kneighbors", + { + "onedal": self.__class__._onedal_kneighbors, + "sklearn": sklearn_NearestNeighbors.kneighbors, + }, + X, + n_neighbors, + return_distance, + ) @wrap_output_data - def radius_neighbors(self, X=None, radius=None, return_distance=True, - sort_results=False): - _onedal_estimator = getattr(self, '_onedal_estimator', None) - - if _onedal_estimator is not None or getattr(self, '_tree', 0) is None and \ - self._fit_method == 'kd_tree': + def radius_neighbors( + self, X=None, radius=None, return_distance=True, sort_results=False + ): + _onedal_estimator = getattr(self, "_onedal_estimator", None) + + if ( + _onedal_estimator is not None + or getattr(self, "_tree", 0) is None + and self._fit_method == "kd_tree" + ): if sklearn_check_version("0.24"): - sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, '_y', None)) + sklearn_NearestNeighbors.fit(self, self._fit_X, getattr(self, "_y", None)) else: sklearn_NearestNeighbors.fit(self, self._fit_X) if sklearn_check_version("0.22"): result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance, sort_results) + self, X, radius, return_distance, sort_results + ) else: result = sklearn_NearestNeighbors.radius_neighbors( - self, X, radius, return_distance) + self, X, radius, return_distance + ) return result def _onedal_fit(self, X, y=None, queue=None): onedal_params = { - 'n_neighbors': self.n_neighbors, - 'algorithm': self.algorithm, - 'metric': self.effective_metric_, - 'p': self.effective_metric_params_['p'], + "n_neighbors": self.n_neighbors, + "algorithm": self.algorithm, + "metric": self.effective_metric_, + "p": self.effective_metric_params_["p"], } try: @@ -148,10 +204,12 @@ def _onedal_fit(self, X, y=None, queue=None): def 
_onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) - def _onedal_kneighbors(self, X=None, n_neighbors=None, - return_distance=True, queue=None): + def _onedal_kneighbors( + self, X=None, n_neighbors=None, return_distance=True, queue=None + ): return self._onedal_estimator.kneighbors( - X, n_neighbors, return_distance, queue=queue) + X, n_neighbors, return_distance, queue=queue + ) def _save_attributes(self): self.classes_ = self._onedal_estimator.classes_ diff --git a/sklearnex/neighbors/lof.py b/sklearnex/neighbors/lof.py index b02f98c64d..720be45ab8 100644 --- a/sklearnex/neighbors/lof.py +++ b/sklearnex/neighbors/lof.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,13 +13,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np import warnings -from sklearn.neighbors._lof import LocalOutlierFactor as \ - sklearn_LocalOutlierFactor +import numpy as np +from sklearn.neighbors._lof import LocalOutlierFactor as sklearn_LocalOutlierFactor + from .knn_unsupervised import NearestNeighbors try: @@ -27,18 +27,21 @@ except ImportError: pass -from sklearn.utils.validation import check_is_fitted from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted from daal4py.sklearn._utils import sklearn_check_version -from .._device_offload import dispatch, wrap_output_data + from .._config import config_context +from .._device_offload import dispatch, wrap_output_data if sklearn_check_version("1.0"): + class LocalOutlierFactor(sklearn_LocalOutlierFactor): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { - **sklearn_LocalOutlierFactor._parameter_constraints} + **sklearn_LocalOutlierFactor._parameter_constraints + } def __init__( self, @@ -62,7 +65,7 @@ def __init__( metric_params=metric_params, n_jobs=n_jobs, contamination=contamination, - novelty=novelty + novelty=novelty, ) def _fit(self, X, y, queue=None): @@ -76,7 +79,7 @@ def _fit(self, X, y, queue=None): metric=self.metric, p=self.p, metric_params=self.metric_params, - n_jobs=self.n_jobs + n_jobs=self.n_jobs, ) self._knn.fit(X) @@ -98,8 +101,9 @@ def _fit(self, X, y, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ =\ - self._knn.kneighbors(n_neighbors=self.n_neighbors_) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self._knn.kneighbors( + n_neighbors=self.n_neighbors_ + ) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_ @@ -127,10 +131,16 @@ def _fit(self, X, y, queue=None): return self def fit(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit', { - 'onedal': self.__class__._fit, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit", + { + "onedal": self.__class__._fit, + "sklearn": None, + }, + X, + y, + ) def _onedal_predict(self, X, queue=None): with 
config_context(target_offload=queue): @@ -148,10 +158,15 @@ def _onedal_predict(self, X, queue=None): @wrap_output_data def _predict(self, X=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": None, + }, + X, + ) def _score_samples(self, X, queue=None): with config_context(target_offload=queue): @@ -183,10 +198,15 @@ def _check_novelty_score_samples(self): @available_if(_check_novelty_score_samples) @wrap_output_data def score_samples(self, X): - return dispatch(self, 'neighbors.LocalOutlierFactor.score_samples', { - 'onedal': self.__class__._score_samples, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.score_samples", + { + "onedal": self.__class__._score_samples, + "sklearn": None, + }, + X, + ) def _check_novelty_fit_predict(self): if self.novelty: @@ -204,17 +224,25 @@ def _fit_predict(self, X, y, queue=None): @available_if(_check_novelty_fit_predict) @wrap_output_data def fit_predict(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit_predict', { - 'onedal': self.__class__._fit_predict, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit_predict", + { + "onedal": self.__class__._fit_predict, + "sklearn": None, + }, + X, + y, + ) def _onedal_gpu_supported(self, method_name, *data): return True def _onedal_cpu_supported(self, method_name, *data): return True + else: + class LocalOutlierFactor(sklearn_LocalOutlierFactor): def __init__( self, @@ -238,7 +266,7 @@ def __init__( metric_params=metric_params, n_jobs=n_jobs, contamination=contamination, - novelty=novelty + novelty=novelty, ) def _fit(self, X, y=None, queue=None): @@ -250,7 +278,7 @@ def _fit(self, X, y=None, queue=None): metric=self.metric, p=self.p, metric_params=self.metric_params, - n_jobs=self.n_jobs + n_jobs=self.n_jobs, ) self._knn.fit(X) @@ -272,8 +300,9 @@ def _fit(self, X, y=None, queue=None): ) self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1)) - self._distances_fit_X_, _neighbors_indices_fit_X_ =\ - self._knn.kneighbors(n_neighbors=self.n_neighbors_) + self._distances_fit_X_, _neighbors_indices_fit_X_ = self._knn.kneighbors( + n_neighbors=self.n_neighbors_ + ) self._lrd = self._local_reachability_density( self._distances_fit_X_, _neighbors_indices_fit_X_ @@ -301,10 +330,16 @@ def _fit(self, X, y=None, queue=None): return self def fit(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.fit', { - 'onedal': self.__class__._fit, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor.fit", + { + "onedal": self.__class__._fit, + "sklearn": None, + }, + X, + y, + ) def _onedal_predict(self, X, queue=None): with config_context(target_offload=queue): @@ -322,10 +357,15 @@ def _onedal_predict(self, X, queue=None): @wrap_output_data def _predict(self, X=None): - return dispatch(self, 'neighbors.LocalOutlierFactor.predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": None, + }, + X, + ) def _onedal_score_samples(self, X, queue=None): with config_context(target_offload=queue): @@ -345,17 +385,24 @@ def _onedal_score_samples(self, X, queue=None): @wrap_output_data def _score_samples(self, 
X): if not self.novelty: - msg = ('score_samples is not available when novelty=False. The ' - 'scores of the training samples are always available ' - 'through the negative_outlier_factor_ attribute. Use ' - 'novelty=True if you want to use LOF for novelty detection ' - 'and compute score_samples for new unseen data.') + msg = ( + "score_samples is not available when novelty=False. The " + "scores of the training samples are always available " + "through the negative_outlier_factor_ attribute. Use " + "novelty=True if you want to use LOF for novelty detection " + "and compute score_samples for new unseen data." + ) raise AttributeError(msg) - return dispatch(self, 'neighbors.LocalOutlierFactor.score_samples', { - 'onedal': self.__class__._onedal_score_samples, - 'sklearn': None, - }, X) + return dispatch( + self, + "neighbors.LocalOutlierFactor.score_samples", + { + "onedal": self.__class__._onedal_score_samples, + "sklearn": None, + }, + X, + ) def _onedal_fit_predict(self, X, y, queue=None): with config_context(target_offload=queue): @@ -363,10 +410,16 @@ def _onedal_fit_predict(self, X, y, queue=None): @wrap_output_data def _fit_predict(self, X, y=None): - return dispatch(self, 'neighbors.LocalOutlierFactor._onedal_fit_predict', { - 'onedal': self.__class__._onedal_fit_predict, - 'sklearn': None, - }, X, y) + return dispatch( + self, + "neighbors.LocalOutlierFactor._onedal_fit_predict", + { + "onedal": self.__class__._onedal_fit_predict, + "sklearn": None, + }, + X, + y, + ) def _onedal_gpu_supported(self, method_name, *data): return True diff --git a/sklearnex/neighbors/tests/test_neighbors.py b/sklearnex/neighbors/tests/test_neighbors.py index e871dc9a3b..735f40a4b8 100755 --- a/sklearnex/neighbors/tests/test_neighbors.py +++ b/sklearnex/neighbors/tests/test_neighbors.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,37 +21,41 @@ def test_sklearnex_import_knn_classifier(): from sklearnex.neighbors import KNeighborsClassifier + X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] neigh = KNeighborsClassifier(n_neighbors=3).fit(X, y) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ assert_allclose(neigh.predict([[1.1]]), [0]) def test_sklearnex_import_knn_regression(): from sklearnex.neighbors import KNeighborsRegressor + X = [[0], [1], [2], [3]] y = [0, 0, 1, 1] neigh = KNeighborsRegressor(n_neighbors=2).fit(X, y) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ assert_allclose(neigh.predict([[1.5]]), [0.5]) def test_sklearnex_import_nn(): from sklearnex.neighbors import NearestNeighbors + X = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] neigh = NearestNeighbors(n_neighbors=2).fit(X) - assert 'sklearnex' in neigh.__module__ + assert "sklearnex" in neigh.__module__ result = neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) assert_allclose(result, [[2, 0]]) def test_sklearnex_import_lof(): from sklearnex.neighbors import LocalOutlierFactor + X = [[7, 7, 7], [1, 0, 0], [0, 0, 1], [0, 0, 1]] lof = LocalOutlierFactor(n_neighbors=2) result = lof.fit_predict(X) - assert hasattr(lof, '_knn') - assert 'sklearnex' in lof.__module__ - assert 'sklearnex' in lof._knn.__module__ + assert hasattr(lof, "_knn") + assert "sklearnex" in lof.__module__ + assert "sklearnex" in lof._knn.__module__ assert_allclose(result, [-1, 1, 1, 1]) diff --git a/sklearnex/preview/__init__.py b/sklearnex/preview/__init__.py index 63508ba7f4..d6431ee3c7 100644 --- a/sklearnex/preview/__init__.py +++ b/sklearnex/preview/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -__all__ = ['cluster', 'decomposition', 'linear_model', 'ensemble'] +__all__ = ["cluster", "decomposition", "linear_model", "ensemble"] diff --git a/sklearnex/preview/cluster/__init__.py b/sklearnex/preview/cluster/__init__.py index 5a3f8d1447..fe90485107 100644 --- a/sklearnex/preview/cluster/__init__.py +++ b/sklearnex/preview/cluster/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .k_means import KMeans -__all__ = ['KMeans'] +__all__ = ["KMeans"] diff --git a/sklearnex/preview/cluster/_common.py b/sklearnex/preview/cluster/_common.py index ab2be9efe3..ddcbe87d9e 100644 --- a/sklearnex/preview/cluster/_common.py +++ b/sklearnex/preview/cluster/_common.py @@ -23,7 +23,7 @@ def get_cluster_centers(self): def set_cluster_centers(self, value): self._cluster_centers_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.cluster_centers_ = value @@ -33,7 +33,7 @@ def get_labels(self): def set_labels(self, value): self._labels_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.labels_ = value @@ -43,7 +43,7 @@ def get_inertia(self): def set_inertia(self, value): self._inertia_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.inertia_ = value @@ -53,7 +53,7 @@ def get_n_iter(self): def set_n_iter(self, value): self._n_iter_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.n_iter_ = value diff --git a/sklearnex/preview/cluster/k_means.py b/sklearnex/preview/cluster/k_means.py index c5a9dd4a09..80abad8c8d 100644 --- a/sklearnex/preview/cluster/k_means.py +++ b/sklearnex/preview/cluster/k_means.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,54 +13,50 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== -from daal4py.sklearn._utils import daal_check_version import logging -if daal_check_version((2023, 'P', 200)): +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2023, "P", 200)): import numpy as np from scipy.sparse import issparse - - from ._common import BaseKMeans - from ..._device_offload import dispatch, wrap_output_data - - from onedal.cluster import KMeans as onedal_KMeans from sklearn.cluster import KMeans as sklearn_KMeans - - from daal4py.sklearn._utils import ( - sklearn_check_version, - PatchingConditionsChain) - + from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils.validation import ( + _deprecate_positional_args, _num_samples, check_is_fitted, - _deprecate_positional_args) + ) - from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version + from onedal.cluster import KMeans as onedal_KMeans + + from ..._device_offload import dispatch, wrap_output_data + from ._common import BaseKMeans class KMeans(sklearn_KMeans, BaseKMeans): __doc__ = sklearn_KMeans.__doc__ n_iter_, inertia_ = None, None labels_, cluster_centers_ = None, None - if sklearn_check_version('1.2'): - _parameter_constraints: dict = { - **sklearn_KMeans._parameter_constraints} + if sklearn_check_version("1.2"): + _parameter_constraints: dict = {**sklearn_KMeans._parameter_constraints} @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', - n_init='auto' if sklearn_check_version('1.4') else 'warn', + init="k-means++", + n_init="auto" if sklearn_check_version("1.4") else "warn", max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='lloyd', + algorithm="lloyd", ): super().__init__( n_clusters=n_clusters, @@ -73,20 +69,22 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) - elif sklearn_check_version('1.0'): + + elif sklearn_check_version("1.0"): + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, - algorithm='auto', + algorithm="auto", ): super().__init__( n_clusters=n_clusters, @@ -99,22 +97,24 @@ def __init__( copy_x=copy_x, algorithm=algorithm, ) + else: + @_deprecate_positional_args def __init__( self, n_clusters=8, *, - init='k-means++', + init="k-means++", n_init=10, max_iter=300, tol=1e-4, - precompute_distances='deprecated', + precompute_distances="deprecated", verbose=0, random_state=None, copy_x=True, - n_jobs='deprecated', - algorithm='auto', + n_jobs="deprecated", + algorithm="auto", ): super().__init__( n_clusters=n_clusters, @@ -132,36 +132,40 @@ def __init__( def _initialize_onedal_estimator(self): onedal_params = { - 'n_clusters': self.n_clusters, - 'init': self.init, - 'max_iter': self.max_iter, - 'tol': self.tol, - 'n_init': self.n_init, - 'verbose': self.verbose, - 'random_state': self.random_state, + "n_clusters": self.n_clusters, + "init": self.init, + "max_iter": self.max_iter, + "tol": self.tol, + "n_init": self.n_init, + "verbose": self.verbose, + "random_state": self.random_state, } self._onedal_estimator = onedal_KMeans(**onedal_params) def _onedal_fit_supported(self, method_name, X, y=None, sample_weight=None): - assert method_name == 'fit' + assert method_name == "fit" class_name 
= self.__class__.__name__ - patching_status = PatchingConditionsChain( - f'sklearn.cluster.{class_name}.fit') + patching_status = PatchingConditionsChain(f"sklearn.cluster.{class_name}.fit") sample_count = _num_samples(X) self._algorithm = self.algorithm supported_algs = ["auto", "full", "lloyd"] correct_count = self.n_clusters < sample_count - patching_status.and_conditions([ - (self.algorithm in supported_algs, 'Only lloyd algorithm is supported.'), - (not issparse(self.init), 'Sparse init values are not supported'), - (correct_count, 'n_clusters is smaller than number of samples'), - (sample_weight is None, 'Sample weight is not None.'), - (not issparse(X), 'Sparse input is not supported.'), - ]) + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (not issparse(self.init), "Sparse init values are not supported"), + (correct_count, "n_clusters is smaller than number of samples"), + (sample_weight is None, "Sample weight is not None."), + (not issparse(X), "Sparse input is not supported."), + ] + ) return patching_status.get_status(logs=True) @@ -184,15 +188,22 @@ def fit(self, X, y=None, sample_weight=None): """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_KMeans.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_KMeans.fit, + }, + X, + y, + sample_weight, + ) return self @@ -216,20 +227,26 @@ def _onedal_fit(self, X, _, sample_weight, queue=None): self._save_attributes() def _onedal_predict_supported(self, method_name, X): - assert method_name == 'predict' + assert method_name == "predict" class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.cluster.{class_name}.predict') + f"sklearn.cluster.{class_name}.predict" + ) supported_algs = ["auto", "full", "lloyd"] dense_centers = not issparse(self.cluster_centers_) - patching_status.and_conditions([ - (self.algorithm in supported_algs, 'Only lloyd algorithm is supported.'), - (dense_centers, 'Sparse clusters is not supported.'), - (not issparse(X), 'Sparse input is not supported.') - ]) + patching_status.and_conditions( + [ + ( + self.algorithm in supported_algs, + "Only lloyd algorithm is supported.", + ), + (dense_centers, "Sparse clusters is not supported."), + (not issparse(X), "Sparse input is not supported."), + ] + ) return patching_status.get_status(logs=True) @@ -253,31 +270,37 @@ def predict(self, X): """ - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_KMeans.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_KMeans.predict, + }, + X, + ) def _onedal_predict(self, X, queue=None): X = self._validate_data(X, accept_sparse=False, reset=False) - if not hasattr(self, '_onedal_estimator'): + if not hasattr(self, "_onedal_estimator"): self._initialize_onedal_estimator() self._onedal_estimator.cluster_centers_ = self.cluster_centers_ return self._onedal_estimator.predict(X, queue=queue) def _onedal_supported(self, method_name, *data): - if method_name == 
'fit': + if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == 'predict': + if method_name == "predict": return self._onedal_predict_supported(method_name, *data) raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) def _onedal_gpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) @@ -335,5 +358,7 @@ def transform(self, X): else: from daal4py.sklearn.cluster import KMeans - logging.warning('Preview KMeans requires oneDAL version >= 2023.2 ' - 'but it was not found') + + logging.warning( + "Preview KMeans requires oneDAL version >= 2023.2 " "but it was not found" + ) diff --git a/sklearnex/preview/decomposition/__init__.py b/sklearnex/preview/decomposition/__init__.py index 4b78bc0172..02fd05199e 100644 --- a/sklearnex/preview/decomposition/__init__.py +++ b/sklearnex/preview/decomposition/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,8 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/preview/decomposition/pca.py b/sklearnex/preview/decomposition/pca.py index dec03700ad..ae779cfc95 100755 --- a/sklearnex/preview/decomposition/pca.py +++ b/sklearnex/preview/decomposition/pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,34 +13,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
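The preview KMeans hunks above route fit and predict through dispatch(): the _onedal_*_supported hooks decide, per call, whether the oneDAL backend handles the data or the stock scikit-learn method runs, and log the reason for any fallback. The sketch below only illustrates that gating pattern under assumed names (this dispatch and ToyEstimator are hypothetical stand-ins, not the actual sklearnex dispatcher or its estimators).

import logging


def dispatch(estimator, method_name, branches, *args):
    # Ask the estimator whether the oneDAL branch can serve this call;
    # otherwise fall back to the stock scikit-learn implementation.
    if estimator._onedal_cpu_supported(method_name, *args):
        return branches["onedal"](estimator, *args)
    return branches["sklearn"](estimator, *args)


class ToyEstimator:
    def _onedal_cpu_supported(self, method_name, X):
        # Mirror the gating style used in the patch: accept only plain dense input.
        supported = isinstance(X, list)
        if not supported:
            logging.info("falling back to sklearn: unsupported input type")
        return supported

    def _onedal_fit(self, X):
        return f"oneDAL fit on {len(X)} samples"

    def _sklearn_fit(self, X):
        return "stock sklearn fit"

    def fit(self, X):
        return dispatch(
            self,
            "fit",
            {"onedal": ToyEstimator._onedal_fit, "sklearn": ToyEstimator._sklearn_fit},
            X,
        )


print(ToyEstimator().fit([[0.0], [1.0]]))  # -> "oneDAL fit on 2 samples"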
-#=============================================================================== +# =============================================================================== -import numpy as np import numbers from math import sqrt + +import numpy as np from scipy.sparse import issparse +from sklearn.base import BaseEstimator +from sklearn.utils.extmath import stable_cumsum +from sklearn.utils.validation import check_array, check_is_fitted -from ..._device_offload import dispatch from daal4py.sklearn._utils import sklearn_check_version from onedal.utils import _check_array -from sklearn.utils.extmath import stable_cumsum -from sklearn.utils.validation import check_array -from sklearn.base import BaseEstimator -from sklearn.utils.validation import check_is_fitted -if sklearn_check_version('1.1') and not sklearn_check_version('1.2'): +from ..._device_offload import dispatch + +if sklearn_check_version("1.1") and not sklearn_check_version("1.2"): from sklearn.utils import check_scalar -if sklearn_check_version('0.23'): +if sklearn_check_version("0.23"): from sklearn.decomposition._pca import _infer_dimension else: from sklearn.decomposition._pca import _infer_dimension_ -from onedal.decomposition import PCA as onedal_PCA from sklearn.decomposition import PCA as sklearn_PCA +from onedal.decomposition import PCA as onedal_PCA + class PCA(sklearn_PCA): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_PCA._parameter_constraints} def __init__( @@ -66,13 +68,11 @@ def __init__( self.power_iteration_normalizer = power_iteration_normalizer self.random_state = random_state - def _validate_n_components(self, n_components, n_samples, - n_features, n_sf_min): + def _validate_n_components(self, n_components, n_samples, n_features, n_sf_min): if n_components == "mle": if n_samples < n_features: raise ValueError( - "n_components='mle' is only supported if" - " n_samples >= n_features" + "n_components='mle' is only supported if" " n_samples >= n_features" ) elif not 0 <= n_components <= n_sf_min: raise ValueError( @@ -82,15 +82,16 @@ def _validate_n_components(self, n_components, n_samples, ) elif n_components >= 1: if not isinstance(n_components, numbers.Integral): - raise ValueError("n_components=%r must be of type int " - "when greater than or equal to 1, " - "was of type=%r" - % (n_components, type(n_components))) + raise ValueError( + "n_components=%r must be of type int " + "when greater than or equal to 1, " + "was of type=%r" % (n_components, type(n_components)) + ) def fit(self, X, y=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): self._validate_params() - elif sklearn_check_version('1.1'): + elif sklearn_check_version("1.1"): check_scalar( self.n_oversamples, "n_oversamples", @@ -107,12 +108,14 @@ def _fit(self, X): "TruncatedSVD for a possible alternative." 
) - if sklearn_check_version('0.23'): - X = self._validate_data(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + if sklearn_check_version("0.23"): + X = self._validate_data( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) else: - X = _check_array(X, dtype=[np.float64, np.float32], - ensure_2d=True, copy=False) + X = _check_array( + X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False + ) n_samples, n_features = X.shape n_sf_min = min(n_samples, n_features) @@ -125,13 +128,12 @@ def _fit(self, X): else: n_components = self.n_components - self._validate_n_components(n_components, n_samples, n_features, - n_sf_min) + self._validate_n_components(n_components, n_samples, n_features, n_sf_min) self._fit_svd_solver = self.svd_solver shape_good_for_daal = X.shape[1] / X.shape[0] < 2 if self._fit_svd_solver == "auto": - if sklearn_check_version('1.1'): + if sklearn_check_version("1.1"): if max(X.shape) <= 500 or n_components == "mle": self._fit_svd_solver = "full" elif 1 <= n_components < 0.8 * n_sf_min: @@ -139,69 +141,73 @@ def _fit(self, X): else: self._fit_svd_solver = "full" else: - if n_components == 'mle': - self._fit_svd_solver = 'full' + if n_components == "mle": + self._fit_svd_solver = "full" else: n, p, k = X.shape[0], X.shape[1], n_components # check if sklearnex is faster than randomized sklearn # Refer to daal4py - regression_coefs = np.array([ - [9.779873e-11, n * p * k], - [-1.122062e-11, n * p * p], - [1.127905e-09, n ** 2], - ]) - - if n_components >= 1 and np.dot( - regression_coefs[:, 0], - regression_coefs[:, 1]) <= 0: - self._fit_svd_solver = 'randomized' + regression_coefs = np.array( + [ + [9.779873e-11, n * p * k], + [-1.122062e-11, n * p * p], + [1.127905e-09, n**2], + ] + ) + + if ( + n_components >= 1 + and np.dot(regression_coefs[:, 0], regression_coefs[:, 1]) <= 0 + ): + self._fit_svd_solver = "randomized" else: - self._fit_svd_solver = 'full' + self._fit_svd_solver = "full" - if not shape_good_for_daal or self._fit_svd_solver != 'full': - if sklearn_check_version('0.23'): + if not shape_good_for_daal or self._fit_svd_solver != "full": + if sklearn_check_version("0.23"): X = self._validate_data(X, copy=self.copy) else: X = check_array(X, copy=self.copy) # Call different fits for either full or truncated SVD if shape_good_for_daal and self._fit_svd_solver == "full": - return dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_PCA._fit_full, - }, X) + return dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_PCA._fit_full, + }, + X, + ) elif not shape_good_for_daal and self._fit_svd_solver == "full": return sklearn_PCA._fit_full(self, X, n_components) elif self._fit_svd_solver in ["arpack", "randomized"]: return sklearn_PCA._fit_truncated( - self, X, n_components, self._fit_svd_solver, + self, + X, + n_components, + self._fit_svd_solver, ) else: - raise ValueError( - "Unrecognized svd_solver='{0}'".format(self._fit_svd_solver) - ) + raise ValueError("Unrecognized svd_solver='{0}'".format(self._fit_svd_solver)) def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': - return self._fit_svd_solver == 'full' - elif method_name == 'transform': - return hasattr(self, '_onedal_estimator') - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}' - ) + if method_name == "fit": + return self._fit_svd_solver == "full" + elif method_name == "transform": + return hasattr(self, "_onedal_estimator") + raise 
RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': - return self._fit_svd_solver == 'full' - elif method_name == 'transform': - return hasattr(self, '_onedal_estimator') - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}' - ) + if method_name == "fit": + return self._fit_svd_solver == "full" + elif method_name == "transform": + return hasattr(self, "_onedal_estimator") + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y=None, queue=None): - - if self.n_components == 'mle' or self.n_components is None: + if self.n_components == "mle" or self.n_components is None: onedal_n_components = min(X.shape) elif 0 < self.n_components < 1: onedal_n_components = min(X.shape) @@ -209,9 +215,9 @@ def _onedal_fit(self, X, y=None, queue=None): onedal_n_components = self.n_components onedal_params = { - 'n_components': onedal_n_components, - 'is_deterministic': True, - 'method': "precomputed", + "n_components": onedal_n_components, + "is_deterministic": True, + "method": "precomputed", } self._onedal_estimator = onedal_PCA(**onedal_params) self._onedal_estimator.fit(X, queue=queue) @@ -227,12 +233,7 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue) def _onedal_transform(self, X): - X = _check_array( - X, - dtype=[np.float64, np.float32], - ensure_2d=True, - copy=False - ) + X = _check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, copy=False) if hasattr(self, "n_features_in_"): if self.n_features_in_ != X.shape[1]: @@ -251,10 +252,15 @@ def _onedal_transform(self, X): # Mean center X_centered = X - self.mean_ - return dispatch(self, 'transform', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_PCA.transform, - }, X_centered) + return dispatch( + self, + "transform", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_PCA.transform, + }, + X_centered, + ) def transform(self, X): check_is_fitted(self) @@ -310,13 +316,12 @@ def _save_attributes(self): self.mean_ = self._onedal_estimator.mean_ self.singular_values_ = self._onedal_estimator.singular_values_ self.explained_variance_ = self._onedal_estimator.explained_variance_ - self.explained_variance_ratio_ = \ - self._onedal_estimator.explained_variance_ratio_ + self.explained_variance_ratio_ = self._onedal_estimator.explained_variance_ratio_ if self.n_components is None: self.n_components_ = self._onedal_estimator.n_components_ - elif self.n_components == 'mle': - if sklearn_check_version('0.23'): + elif self.n_components == "mle": + if sklearn_check_version("0.23"): self.n_components_ = _infer_dimension( self.explained_variance_, self.n_samples_ ) @@ -326,23 +331,25 @@ def _save_attributes(self): ) elif 0 < self.n_components < 1.0: ratio_cumsum = stable_cumsum(self.explained_variance_ratio_) - self.n_components_ = np.searchsorted( - ratio_cumsum, self.n_components, side='right') + 1 + self.n_components_ = ( + np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1 + ) else: self.n_components_ = self._onedal_estimator.n_components_ if self.n_components_ < n_sf_min: if self.explained_variance_.shape[0] == n_sf_min: - self.noise_variance_ = \ - self.explained_variance_[self.n_components_:].mean() + self.noise_variance_ = self.explained_variance_[ + self.n_components_ : + ].mean() else: self.noise_variance_ = self._onedal_estimator.noise_variance_ else: - 
self.noise_variance_ = 0. - - self.explained_variance_ = self.explained_variance_[:self.n_components_] - self.explained_variance_ratio_ = \ - self.explained_variance_ratio_[:self.n_components_] - self.components_ = \ - self._onedal_estimator.components_[:self.n_components_] - self.singular_values_ = self.singular_values_[:self.n_components_] + self.noise_variance_ = 0.0 + + self.explained_variance_ = self.explained_variance_[: self.n_components_] + self.explained_variance_ratio_ = self.explained_variance_ratio_[ + : self.n_components_ + ] + self.components_ = self._onedal_estimator.components_[: self.n_components_] + self.singular_values_ = self.singular_values_[: self.n_components_] diff --git a/sklearnex/preview/decomposition/tests/test_preview_pca.py b/sklearnex/preview/decomposition/tests/test_preview_pca.py index e4b4ad5c18..5a3a891bce 100755 --- a/sklearnex/preview/decomposition/tests/test_preview_pca.py +++ b/sklearnex/preview/decomposition/tests/test_preview_pca.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,8 +21,9 @@ def test_sklearnex_import(): from sklearnex.preview.decomposition import PCA + X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) - pca = PCA(n_components=2, svd_solver='full').fit(X) - assert 'sklearnex' in pca.__module__ - assert hasattr(pca, '_onedal_estimator') + pca = PCA(n_components=2, svd_solver="full").fit(X) + assert "sklearnex" in pca.__module__ + assert hasattr(pca, "_onedal_estimator") assert_allclose(pca.singular_values_, [6.30061232, 0.54980396]) diff --git a/sklearnex/preview/ensemble/__init__.py b/sklearnex/preview/ensemble/__init__.py index e7a8fde386..cce939b4cf 100755 --- a/sklearnex/preview/ensemble/__init__.py +++ b/sklearnex/preview/ensemble/__init__.py @@ -15,8 +15,12 @@ # limitations under the License. 
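In the _save_attributes hunk above, a fractional n_components is resolved by accumulating explained_variance_ratio_ and taking the first index whose cumulative sum reaches the requested fraction. A minimal numpy sketch of that step follows (np.cumsum stands in for sklearn's stable_cumsum; the ratio values are made up for illustration).

import numpy as np

explained_variance_ratio_ = np.array([0.7, 0.2, 0.07, 0.03])
n_components = 0.95  # keep enough components to explain 95% of the variance

ratio_cumsum = np.cumsum(explained_variance_ratio_)
n_components_ = int(np.searchsorted(ratio_cumsum, n_components, side="right") + 1)
print(n_components_)  # 3, since 0.7 + 0.2 + 0.07 >= 0.95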
# =============================================================================== -from .forest import RandomForestClassifier, RandomForestRegressor from .extra_trees import ExtraTreesClassifier, ExtraTreesRegressor +from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['ExtraTreesClassifier', 'ExtraTreesRegressor', - 'RandomForestClassifier', 'RandomForestRegressor'] +__all__ = [ + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "RandomForestClassifier", + "RandomForestRegressor", +] diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index a6b5ba8398..4f6a17621b 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,51 +13,44 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== - -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, - make2d, PatchingConditionsChain, check_tree_nodes -) - -import numpy as np +# =============================================================================== import numbers - import warnings - from abc import ABC -from sklearn.exceptions import DataConversionWarning - -from ..._config import get_config -from ..._device_offload import dispatch, wrap_output_data - +import numpy as np +from scipy import sparse as sp +from sklearn.base import clone from sklearn.ensemble import ExtraTreesClassifier as sklearn_ExtraTreesClassifier from sklearn.ensemble import ExtraTreesRegressor as sklearn_ExtraTreesRegressor - -from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, - check_array, - check_X_y) - -from onedal.utils import _num_features, _num_samples - -from sklearn.utils import check_random_state, deprecated - -from sklearn.base import clone - +from sklearn.exceptions import DataConversionWarning from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor from sklearn.tree._tree import Tree +from sklearn.utils import check_random_state, deprecated +from sklearn.utils.validation import ( + check_array, + check_consistent_length, + check_is_fitted, + check_X_y, +) +from daal4py.sklearn._utils import ( + PatchingConditionsChain, + check_tree_nodes, + daal_check_version, + make2d, + sklearn_check_version, +) from onedal.ensemble import ExtraTreesClassifier as onedal_ExtraTreesClassifier from onedal.ensemble import ExtraTreesRegressor as onedal_ExtraTreesRegressor from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.utils import _num_features, _num_samples -from scipy import sparse as sp +from ..._config import get_config +from ..._device_offload import dispatch, wrap_output_data -if sklearn_check_version('1.2'): +if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval @@ -69,7 +62,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): # We use stock metaestimators below, so the only way # to pass a queue is using config_context. 
cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -79,8 +72,9 @@ def _save_attributes(self): if hasattr(self._onedal_estimator, "oob_prediction_"): self.oob_prediction_ = self._onedal_estimator.oob_prediction_ if hasattr(self._onedal_estimator, "oob_decision_function_"): - self.oob_decision_function_ = \ + self.oob_decision_function_ = ( self._onedal_estimator.oob_decision_function_ + ) return self def _onedal_classifier(self, **onedal_params): @@ -92,69 +86,79 @@ def _onedal_regressor(self, **onedal_params): # TODO: # move to onedal modul. def _check_parameters(self): - if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if getattr(self, "min_impurity_split", None) is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if getattr(self, "min_impurity_split") < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. 
" + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if getattr(self, "min_impurity_split") < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -174,49 +178,55 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" 
- .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class ExtraTreesClassifier(sklearn_ExtraTreesClassifier, BaseTree): __doc__ = sklearn_ExtraTreesClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -233,7 +243,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -242,28 +252,31 @@ def __init__( self.min_bin_size = min_bin_size else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -283,7 +296,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -317,17 +330,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", 
+ { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -335,22 +353,33 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") - - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (self.criterion == "gini", - f"'{self.criterion}' criterion is not supported. " - "Only 'gini' criterion is supported."), - (self.warm_start is False, "Warm start is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + raise ValueError("Out of bag estimation only available" " if bootstrap=True") + + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + ( + self.criterion == "gini", + f"'{self.criterion}' criterion is not supported. " + "Only 'gini' criterion is supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -364,22 +393,29 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): " expected. 
Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - (y.dtype in [np.float32, np.float64, np.int32, np.int64], - f"Datatype ({y.dtype}) for y is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ( + y.dtype in [np.float32, np.float64, np.int32, np.int64], + f"Datatype ({y.dtype}) for y is not supported.", + ), + ] + ) # TODO: Fix to support integers as input n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -388,9 +424,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -433,10 +469,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -467,54 +508,64 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but ExtraTreesClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_ExtraTreesClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but ExtraTreesClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_ExtraTreesClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." 
+ ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -523,29 +574,27 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -555,48 +604,64 @@ def _estimators_(self): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), 
"sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready = dal_ready and not hasattr(self, 'estimators_') + dal_ready = dal_ready and not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready = dal_ready and _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready = dal_ready and _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -604,62 +669,85 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported.") - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. 
" - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = hasattr(self, '_onedal_model') and hasattr(self, 'n_outputs_') + dal_ready = hasattr(self, "_onedal_model") and hasattr(self, "n_outputs_") if dal_ready: - dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) + dal_ready = _patching_status.and_conditions( + [ + ( + not sp.issparse(X), + "X is sparse. Sparse input is not supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -683,7 +771,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -695,43 +783,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': 
self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" - if not sklearn_check_version('1.0'): - onedal_params['min_impurity_split'] = self.min_impurity_split + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" + if not sklearn_check_version("1.0"): + onedal_params["min_impurity_split"] = self.min_impurity_split else: - onedal_params['min_impurity_split'] = None + onedal_params["min_impurity_split"] = None self._cached_estimators_ = None # Compute @@ -754,13 +841,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -770,36 +856,38 @@ def _onedal_predict_proba(self, X, queue=None): class ExtraTreesRegressor(sklearn_ExtraTreesRegressor, BaseTree): __doc__ = sklearn_ExtraTreesRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + 
oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -815,36 +903,40 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha self.max_samples = max_samples self.max_bins = max_bins self.min_bin_size = min_bin_size + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1 - ): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -863,7 +955,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -873,27 +965,27 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -902,26 +994,25 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + 
random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -929,9 +1020,7 @@ def _estimators_(self): def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -939,30 +1028,41 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") - if sklearn_check_version('1.0') and self.criterion == "mse": + if sklearn_check_version("1.0") and self.criterion == "mse": warnings.warn( "Criterion 'mse' was deprecated in v1.0 and will be " "removed in version 1.2. Use `criterion='squared_error'` " "which is equivalent.", - FutureWarning + FutureWarning, ) - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (self.warm_start is False, "Warm start is not supported."), - (self.criterion in ["mse", "squared_error"], - f"'{self.criterion}' criterion is not supported. " - "Only 'mse' and 'squared_error' criteria are supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.criterion in ["mse", "squared_error"], + f"'{self.criterion}' criterion is not supported. " + "Only 'mse' and 'squared_error' criteria are supported.", + ), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -972,10 +1072,13 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warnings.warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) @@ -986,14 +1089,18 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ) + ] + ) n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -1002,9 +1109,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -1028,48 +1135,64 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), "sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. 
" + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -1077,55 +1200,66 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. 
Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1142,41 +1276,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" + if 
daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1219,10 +1354,17 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1247,15 +1389,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index ef177b1bc2..99d36a2c2f 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -15,49 +15,41 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, - make2d, check_tree_nodes -) - -import numpy as np - import numbers - import warnings - from abc import ABC -from sklearn.exceptions import DataConversionWarning - -from ..._config import get_config -from ..._device_offload import dispatch, wrap_output_data - +import numpy as np +from scipy import sparse as sp +from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier as sklearn_RandomForestClassifier from sklearn.ensemble import RandomForestRegressor as sklearn_RandomForestRegressor - -from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, - check_array, - check_X_y) - -from onedal.utils import _num_features, _num_samples - -from sklearn.utils import check_random_state, deprecated - -from sklearn.base import clone - +from sklearn.exceptions import DataConversionWarning from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.tree._tree import Tree +from sklearn.utils import check_random_state, deprecated +from sklearn.utils.validation import ( + check_array, + check_consistent_length, + check_is_fitted, + check_X_y, +) +from daal4py.sklearn._utils import ( + check_tree_nodes, + daal_check_version, + make2d, + sklearn_check_version, +) from onedal.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.ensemble import RandomForestRegressor as onedal_RandomForestRegressor from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.utils import _num_features, _num_samples -from scipy import sparse as sp +from ..._config import get_config +from ..._device_offload import dispatch, wrap_output_data -if sklearn_check_version('1.2'): +if sklearn_check_version("1.2"): from 
sklearn.utils._param_validation import Interval, StrOptions @@ -69,7 +61,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -97,66 +89,77 @@ def _check_parameters(self): ) if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. 
" + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -176,51 +179,57 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" 
- .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest): __doc__ = sklearn_RandomForestClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -237,7 +246,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -247,30 +256,34 @@ def __init__( self.min_impurity_split = None self.splitter_mode = splitter_mode # self._estimator = DecisionTreeClassifier() + else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -290,7 +303,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -327,17 +340,22 @@ def 
fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_ready(self, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. " @@ -345,8 +363,7 @@ def _onedal_ready(self, X, y, sample_weight): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") if sklearn_check_version("1.2"): self._validate_params() else: @@ -357,16 +374,20 @@ def _onedal_ready(self, X, y, sample_weight): correct_criterion = self.criterion == "gini" correct_warm_start = self.warm_start is False - if daal_check_version((2021, 'P', 500)): + if daal_check_version((2021, "P", 500)): correct_oob_score = not self.oob_score else: correct_oob_score = self.oob_score - ready = all([correct_oob_score, - correct_sparsity, - correct_ccp_alpha, - correct_criterion, - correct_warm_start]) + ready = all( + [ + correct_oob_score, + correct_sparsity, + correct_ccp_alpha, + correct_criterion, + correct_warm_start, + ] + ) if ready: if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) @@ -379,7 +400,8 @@ def _onedal_ready(self, X, y, sample_weight): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) @@ -412,10 +434,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. 
""" - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -446,54 +473,64 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but RandomForestClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_RandomForestClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but RandomForestClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_RandomForestClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -502,29 +539,27 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = 
self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -532,13 +567,16 @@ def _estimators_(self): return estimators_ def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -551,39 +589,42 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. 
" + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -602,37 +643,42 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -656,7 +702,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -668,40 +714,39 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'min_impurity_split': self.min_impurity_split, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": 
self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "min_impurity_split": self.min_impurity_split, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None # Compute @@ -724,13 +769,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -740,38 +784,40 @@ def _onedal_predict_proba(self, X, queue=None): class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest): __doc__ = sklearn_RandomForestRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -787,7 +833,7 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -796,29 +842,34 @@ def __init__( self.min_bin_size = min_bin_size self.min_impurity_split = None self.splitter_mode = splitter_mode + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - 
max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -837,7 +888,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -849,27 +900,27 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -878,26 +929,25 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": 
tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -920,17 +970,23 @@ def _onedal_ready(self, X, y, sample_weight): return ready, X, y, sample_weight def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -944,42 +1000,48 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': - if not hasattr(self, '_onedal_model'): + if method_name == "predict": + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(data[0]): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. 
" + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -995,32 +1057,29 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif self.oob_score: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': + if method_name == "predict": X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1034,37 +1093,37 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode 
+ if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1113,10 +1172,17 @@ def fit(self, X, y, sample_weight=None): "Either switch to `bootstrap=True` or set " "`max_sample=None`." ) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1141,15 +1207,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ diff --git a/sklearnex/preview/ensemble/tests/test_preview_ensemble.py b/sklearnex/preview/ensemble/tests/test_preview_ensemble.py index 25bd992e60..0a064e8ed0 100755 --- a/sklearnex/preview/ensemble/tests/test_preview_ensemble.py +++ b/sklearnex/preview/ensemble/tests/test_preview_ensemble.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,55 +13,68 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from numpy.testing import assert_allclose from sklearn.datasets import make_classification, make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_rf_classifier(): from sklearnex.preview.ensemble import RandomForestClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) rf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex.preview' in rf.__module__ + assert "sklearnex.preview" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_rf_regression(): from sklearnex.preview.ensemble import RandomForestRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) rf = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex.preview' in rf.__module__ + assert "sklearnex.preview" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) assert_allclose([-6.839], pred, atol=1e-2) def test_sklearnex_import_et_classifier(): from sklearnex.preview.ensemble import ExtraTreesClassifier - X, y = make_classification(n_samples=1000, n_features=4, - n_informative=2, n_redundant=0, - random_state=0, shuffle=False) + + X, y = make_classification( + n_samples=1000, + n_features=4, + n_informative=2, + n_redundant=0, + random_state=0, + shuffle=False, + ) # For the 2023.2 release, random_state is not supported # defaults to seed=777, although it is set to 0 rf = ExtraTreesClassifier(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex' in rf.__module__ + assert "sklearnex" in rf.__module__ assert_allclose([1], rf.predict([[0, 0, 0, 0]])) def test_sklearnex_import_et_regression(): from sklearnex.preview.ensemble import ExtraTreesRegressor - X, y = make_regression(n_features=4, n_informative=2, - random_state=0, shuffle=False) + + X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False) # For the 2023.2 release, random_state is not supported # defaults to seed=777, although it is set to 0 rf = ExtraTreesRegressor(max_depth=2, random_state=0).fit(X, y) - assert 'sklearnex' in rf.__module__ + assert "sklearnex" in rf.__module__ pred = rf.predict([[0, 0, 0, 0]]) - if daal_check_version((2023, 'P', 200)): + if daal_check_version((2023, "P", 200)): assert_allclose([27.138], pred, atol=1e-2) else: assert_allclose([-2.826], pred, atol=1e-2) diff --git a/sklearnex/preview/linear_model/__init__.py b/sklearnex/preview/linear_model/__init__.py index 463003bb1d..a244f823a8 100755 --- a/sklearnex/preview/linear_model/__init__.py +++ b/sklearnex/preview/linear_model/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,10 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear import LinearRegression -__all__ = [ - 'LinearRegression' -] +__all__ = ["LinearRegression"] diff --git a/sklearnex/preview/linear_model/_common.py b/sklearnex/preview/linear_model/_common.py index d08ed98ede..c93241874c 100644 --- a/sklearnex/preview/linear_model/_common.py +++ b/sklearnex/preview/linear_model/_common.py @@ -15,13 +15,13 @@ # =============================================================================== from abc import ABC -import numpy as np -from daal4py.sklearn._utils import sklearn_check_version +import numpy as np +from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder -from sklearn.calibration import CalibratedClassifierCV +from daal4py.sklearn._utils import sklearn_check_version from onedal.utils import _column_or_1d @@ -31,7 +31,7 @@ def get_coef(self): def set_coef(self, value): self._coef_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.coef_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -43,7 +43,7 @@ def get_intercept(self): def set_intercept(self, value): self._intercept_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.intercept_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model diff --git a/sklearnex/preview/linear_model/linear.py b/sklearnex/preview/linear_model/linear.py index 46569d143b..1f0d69ea48 100644 --- a/sklearnex/preview/linear_model/linear.py +++ b/sklearnex/preview/linear_model/linear.py @@ -14,35 +14,40 @@ # limitations under the License. 
# =============================================================================== -from daal4py.sklearn._utils import daal_check_version import logging -if daal_check_version((2023, 'P', 100)): +from daal4py.sklearn._utils import daal_check_version + +if daal_check_version((2023, "P", 100)): import numpy as np + from sklearn.linear_model import LinearRegression as sklearn_LinearRegression - from ._common import BaseLinearRegression - from ..._device_offload import dispatch, wrap_output_data + from daal4py.sklearn._utils import ( + PatchingConditionsChain, + get_dtype, + make2d, + sklearn_check_version, + ) + from ..._device_offload import dispatch, wrap_output_data from ...utils.validation import _assert_all_finite - from daal4py.sklearn._utils import ( - get_dtype, make2d, sklearn_check_version, PatchingConditionsChain) - from sklearn.linear_model import LinearRegression as sklearn_LinearRegression + from ._common import BaseLinearRegression - if sklearn_check_version('1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): from sklearn.linear_model._base import _deprecate_normalize - from sklearn.utils.validation import _deprecate_positional_args, check_X_y - from sklearn.exceptions import NotFittedError from scipy.sparse import issparse + from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import _deprecate_positional_args, check_X_y from onedal.linear_model import LinearRegression as onedal_LinearRegression - from onedal.utils import (_num_features, _num_samples) + from onedal.utils import _num_features, _num_samples class LinearRegression(sklearn_LinearRegression, BaseLinearRegression): __doc__ = sklearn_LinearRegression.__doc__ intercept_, coef_ = None, None - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_LinearRegression._parameter_constraints } @@ -60,11 +65,13 @@ def __init__( n_jobs=n_jobs, positive=positive, ) - elif sklearn_check_version('0.24'): + + elif sklearn_check_version("0.24"): + def __init__( self, fit_intercept=True, - normalize='deprecated' if sklearn_check_version('1.0') else False, + normalize="deprecated" if sklearn_check_version("1.0") else False, copy_X=True, n_jobs=None, positive=False, @@ -76,7 +83,9 @@ def __init__( n_jobs=n_jobs, positive=positive, ) + else: + def __init__( self, fit_intercept=True, @@ -88,7 +97,7 @@ def __init__( fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, - n_jobs=n_jobs + n_jobs=n_jobs, ) def fit(self, X, y, sample_weight=None): @@ -109,15 +118,22 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) if sklearn_check_version("1.2"): self._validate_params() - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_LinearRegression.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_LinearRegression.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -135,16 +151,21 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_LinearRegression.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_LinearRegression.predict, + }, + X, + ) def _test_type_and_finiteness(self, X_in): X = X_in if isinstance(X_in, np.ndarray) else np.asarray(X_in) dtype = X.dtype - if 'complex' in str(type(dtype)): + if "complex" in str(type(dtype)): return False try: @@ -154,79 +175,99 @@ def _test_type_and_finiteness(self, X_in): return True def _onedal_fit_supported(self, method_name, *data): - assert method_name == 'fit' + assert method_name == "fit" assert len(data) == 3 X, y, sample_weight = data class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.linear_model.{class_name}.fit') + f"sklearn.linear_model.{class_name}.fit" + ) - normalize_is_set = hasattr(self, 'normalize') and self.normalize \ - and self.normalize != 'deprecated' - positive_is_set = hasattr(self, 'positive') and self.positive + normalize_is_set = ( + hasattr(self, "normalize") + and self.normalize + and self.normalize != "deprecated" + ) + positive_is_set = hasattr(self, "positive") and self.positive n_samples = _num_samples(X) n_features = _num_features(X, fallback_1d=True) # Check if equations are well defined - is_good_for_onedal = n_samples > \ - (n_features + int(self.fit_intercept)) - - dal_ready = patching_status.and_conditions([ - (sample_weight is None, 'Sample weight is not supported.'), - (not issparse(X) and not issparse(y), 'Sparse input is not supported.'), - (not normalize_is_set, 'Normalization is not supported.'), - (not positive_is_set, 'Forced positive coefficients are not supported.'), - (is_good_for_onedal, - 'The shape of X (fitting) does not satisfy oneDAL requirements:.' - 'Number of features + 1 >= number of samples.') - ]) + is_good_for_onedal = n_samples > (n_features + int(self.fit_intercept)) + + dal_ready = patching_status.and_conditions( + [ + (sample_weight is None, "Sample weight is not supported."), + ( + not issparse(X) and not issparse(y), + "Sparse input is not supported.", + ), + (not normalize_is_set, "Normalization is not supported."), + ( + not positive_is_set, + "Forced positive coefficients are not supported.", + ), + ( + is_good_for_onedal, + "The shape of X (fitting) does not satisfy oneDAL requirements:." + "Number of features + 1 >= number of samples.", + ), + ] + ) if not dal_ready: return patching_status.get_status(logs=True) if not patching_status.and_condition( - self._test_type_and_finiteness(X), 'Input X is not supported.' + self._test_type_and_finiteness(X), "Input X is not supported." ): return patching_status.get_status(logs=True) patching_status.and_condition( - self._test_type_and_finiteness(y), 'Input y is not supported.') + self._test_type_and_finiteness(y), "Input y is not supported." 
+ ) return patching_status.get_status(logs=True) def _onedal_predict_supported(self, method_name, *data): - assert method_name == 'predict' + assert method_name == "predict" assert len(data) == 1 class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.linear_model.{class_name}.predict') + f"sklearn.linear_model.{class_name}.predict" + ) n_samples = _num_samples(*data) - model_is_sparse = issparse(self.coef_) or \ - (self.fit_intercept and issparse(self.intercept_)) - dal_ready = patching_status.and_conditions([ - (n_samples > 0, 'Number of samples is less than 1.'), - (not issparse(*data), 'Sparse input is not supported.'), - (not model_is_sparse, 'Sparse coefficients are not supported.'), - (hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.') - ]) + model_is_sparse = issparse(self.coef_) or ( + self.fit_intercept and issparse(self.intercept_) + ) + dal_ready = patching_status.and_conditions( + [ + (n_samples > 0, "Number of samples is less than 1."), + (not issparse(*data), "Sparse input is not supported."), + (not model_is_sparse, "Sparse coefficients are not supported."), + (hasattr(self, "_onedal_estimator"), "oneDAL model was not trained."), + ] + ) if not dal_ready: return patching_status.get_status(logs=True) patching_status.and_condition( - self._test_type_and_finiteness(*data), 'Input X is not supported.') + self._test_type_and_finiteness(*data), "Input X is not supported." + ) return patching_status.get_status(logs=True) def _onedal_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": return self._onedal_fit_supported(method_name, *data) - if method_name == 'predict': + if method_name == "predict": return self._onedal_predict_supported(method_name, *data) raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) def _onedal_gpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) @@ -235,30 +276,27 @@ def _onedal_cpu_supported(self, method_name, *data): return self._onedal_supported(method_name, *data) def _initialize_onedal_estimator(self): - onedal_params = { - 'fit_intercept': self.fit_intercept, - 'copy_X': self.copy_X} + onedal_params = {"fit_intercept": self.fit_intercept, "copy_X": self.copy_X} self._onedal_estimator = onedal_LinearRegression(**onedal_params) def _onedal_fit(self, X, y, sample_weight, queue=None): assert sample_weight is None check_params = { - 'X': X, - 'y': y, - 'dtype': [np.float64, np.float32], - 'accept_sparse': ['csr', 'csc', 'coo'], - 'y_numeric': True, - 'multi_output': True, - 'force_all_finite': False + "X": X, + "y": y, + "dtype": [np.float64, np.float32], + "accept_sparse": ["csr", "csc", "coo"], + "y_numeric": True, + "multi_output": True, + "force_all_finite": False, } - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data(**check_params) else: X, y = check_X_y(**check_params) - if sklearn_check_version( - '1.0') and not sklearn_check_version('1.2'): + if sklearn_check_version("1.0") and not sklearn_check_version("1.2"): self._normalize = _deprecate_normalize( self.normalize, default=False, @@ -272,7 +310,7 @@ def _onedal_fit(self, X, y, sample_weight, queue=None): def _onedal_predict(self, X, queue=None): X = self._validate_data(X, accept_sparse=False, reset=False) - if not hasattr(self, '_onedal_estimator'): + if not hasattr(self, "_onedal_estimator"): 
self._initialize_onedal_estimator() self._onedal_estimator.coef_ = self.coef_ self._onedal_estimator.intercept_ = self.intercept_ @@ -281,5 +319,8 @@ def _onedal_predict(self, X, queue=None): else: from daal4py.sklearn.linear_model import LinearRegression - logging.warning('Preview LinearRegression requires oneDAL version >= 2023.1 ' - 'but it was not found') + + logging.warning( + "Preview LinearRegression requires oneDAL version >= 2023.1 " + "but it was not found" + ) diff --git a/sklearnex/preview/linear_model/tests/test_preview_linear.py b/sklearnex/preview/linear_model/tests/test_preview_linear.py index fe39460136..80d00324e4 100755 --- a/sklearnex/preview/linear_model/tests/test_preview_linear.py +++ b/sklearnex/preview/linear_model/tests/test_preview_linear.py @@ -18,19 +18,21 @@ import numpy as np from numpy.testing import assert_allclose from sklearn.datasets import make_regression + from daal4py.sklearn._utils import daal_check_version def test_sklearnex_import_linear(): from sklearnex.preview.linear_model import LinearRegression + X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]]) y = np.dot(X, np.array([1, 2])) + 3 linreg = LinearRegression().fit(X, y) - if daal_check_version((2023, 'P', 100)): - assert 'sklearnex' in linreg.__module__ - assert hasattr(linreg, '_onedal_estimator') + if daal_check_version((2023, "P", 100)): + assert "sklearnex" in linreg.__module__ + assert hasattr(linreg, "_onedal_estimator") else: - assert 'daal4py' in linreg.__module__ + assert "daal4py" in linreg.__module__ assert linreg.n_features_in_ == 2 - assert_allclose(linreg.intercept_, 3.) - assert_allclose(linreg.coef_, [1., 2.]) + assert_allclose(linreg.intercept_, 3.0) + assert_allclose(linreg.coef_, [1.0, 2.0]) diff --git a/sklearnex/spmd/__init__.py b/sklearnex/spmd/__init__.py index 9099df571a..3c698d694b 100644 --- a/sklearnex/spmd/__init__.py +++ b/sklearnex/spmd/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== __all__ = [ - 'basic_statistics', - 'cluster', - 'decomposition', - 'ensemble', - 'linear_model', - 'neighbors'] + "basic_statistics", + "cluster", + "decomposition", + "ensemble", + "linear_model", + "neighbors", +] diff --git a/sklearnex/spmd/basic_statistics/__init__.py b/sklearnex/spmd/basic_statistics/__init__.py index 6f45ecfe5c..2b99fdbdb7 100644 --- a/sklearnex/spmd/basic_statistics/__init__.py +++ b/sklearnex/spmd/basic_statistics/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .basic_statistics import BasicStatistics -__all__ = ['BasicStatistics'] +__all__ = ["BasicStatistics"] diff --git a/sklearnex/spmd/basic_statistics/basic_statistics.py b/sklearnex/spmd/basic_statistics/basic_statistics.py index fadc1686d2..9073c3d941 100644 --- a/sklearnex/spmd/basic_statistics/basic_statistics.py +++ b/sklearnex/spmd/basic_statistics/basic_statistics.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.basic_statistics import BasicStatistics diff --git a/sklearnex/spmd/cluster/__init__.py b/sklearnex/spmd/cluster/__init__.py index b94f1d3918..0c39935dc2 100644 --- a/sklearnex/spmd/cluster/__init__.py +++ b/sklearnex/spmd/cluster/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,13 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import daal_check_version -if daal_check_version((2023, 'P', 200)): +if daal_check_version((2023, "P", 200)): from .kmeans import KMeans - __all__ = ['KMeans'] + __all__ = ["KMeans"] else: __all__ = [] diff --git a/sklearnex/spmd/cluster/kmeans.py b/sklearnex/spmd/cluster/kmeans.py index cf614343e2..e8f97c576d 100644 --- a/sklearnex/spmd/cluster/kmeans.py +++ b/sklearnex/spmd/cluster/kmeans.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from onedal.spmd.cluster import KMeans diff --git a/sklearnex/spmd/decomposition/__init__.py b/sklearnex/spmd/decomposition/__init__.py index eda7b9fc14..618e0b9082 100644 --- a/sklearnex/spmd/decomposition/__init__.py +++ b/sklearnex/spmd/decomposition/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .pca import PCA -__all__ = ['PCA'] +__all__ = ["PCA"] diff --git a/sklearnex/spmd/decomposition/pca.py b/sklearnex/spmd/decomposition/pca.py index 5bf6eb63ab..bef34e3bbb 100644 --- a/sklearnex/spmd/decomposition/pca.py +++ b/sklearnex/spmd/decomposition/pca.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.decomposition import PCA diff --git a/sklearnex/spmd/ensemble/__init__.py b/sklearnex/spmd/ensemble/__init__.py index 5dcc919355..b53fb8f910 100644 --- a/sklearnex/spmd/ensemble/__init__.py +++ b/sklearnex/spmd/ensemble/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .forest import RandomForestClassifier, RandomForestRegressor -__all__ = ['RandomForestClassifier', 'RandomForestRegressor'] +__all__ = ["RandomForestClassifier", "RandomForestRegressor"] diff --git a/sklearnex/spmd/ensemble/forest.py b/sklearnex/spmd/ensemble/forest.py index cfb711f3d2..8eb77ac75a 100644 --- a/sklearnex/spmd/ensemble/forest.py +++ b/sklearnex/spmd/ensemble/forest.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,17 +12,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from abc import ABC from onedal.spmd.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.spmd.ensemble import RandomForestRegressor as onedal_RandomForestRegressor -from ...preview.ensemble.forest import RandomForestClassifier as \ - RandomForestClassifier_Batch -from ...preview.ensemble.forest import RandomForestRegressor as \ - RandomForestRegressor_Batch +from ...preview.ensemble.forest import ( + RandomForestClassifier as RandomForestClassifier_Batch, +) +from ...preview.ensemble.forest import ( + RandomForestRegressor as RandomForestRegressor_Batch, +) class BaseForestSPMD(ABC): @@ -42,16 +44,18 @@ def _onedal_cpu_supported(self, method_name, *data): ready = super()._onedal_cpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready def _onedal_gpu_supported(self, method_name, *data): ready = super()._onedal_gpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready @@ -64,14 +68,16 @@ def _onedal_cpu_supported(self, method_name, *data): ready = super()._onedal_cpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." + ) return ready def _onedal_gpu_supported(self, method_name, *data): ready = super()._onedal_gpu_supported(method_name, *data) if not ready: raise RuntimeError( - f'Method {method_name} in {self.__class__.__name__} ' - 'is not supported with given inputs.') + f"Method {method_name} in {self.__class__.__name__} " + "is not supported with given inputs." 
+ ) return ready diff --git a/sklearnex/spmd/linear_model/__init__.py b/sklearnex/spmd/linear_model/__init__.py index 33e882bdcb..893243cd56 100644 --- a/sklearnex/spmd/linear_model/__init__.py +++ b/sklearnex/spmd/linear_model/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .linear_model import LinearRegression -__all__ = ['LinearRegression'] +__all__ = ["LinearRegression"] diff --git a/sklearnex/spmd/linear_model/linear_model.py b/sklearnex/spmd/linear_model/linear_model.py index e179b3fdad..bf0814ca02 100644 --- a/sklearnex/spmd/linear_model/linear_model.py +++ b/sklearnex/spmd/linear_model/linear_model.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.linear_model import LinearRegression diff --git a/sklearnex/spmd/neighbors/__init__.py b/sklearnex/spmd/neighbors/__init__.py index 99099fa51c..11f104287a 100644 --- a/sklearnex/spmd/neighbors/__init__.py +++ b/sklearnex/spmd/neighbors/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from .neighbors import KNeighborsClassifier, KNeighborsRegressor, NearestNeighbors -__all__ = ['KNeighborsClassifier', 'KNeighborsRegressor', 'NearestNeighbors'] +__all__ = ["KNeighborsClassifier", "KNeighborsRegressor", "NearestNeighbors"] diff --git a/sklearnex/spmd/neighbors/neighbors.py b/sklearnex/spmd/neighbors/neighbors.py index 7eaa5e9f62..1fbe9c1bd1 100644 --- a/sklearnex/spmd/neighbors/neighbors.py +++ b/sklearnex/spmd/neighbors/neighbors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,12 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from onedal.spmd.neighbors import ( KNeighborsClassifier, KNeighborsRegressor, - NearestNeighbors + NearestNeighbors, ) # TODO: diff --git a/sklearnex/svm/__init__.py b/sklearnex/svm/__init__.py index 3a9aa066d9..1ec77833b7 100755 --- a/sklearnex/svm/__init__.py +++ b/sklearnex/svm/__init__.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,15 +13,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .._utils import get_sklearnex_version -if get_sklearnex_version((2021, 'P', 300)): - from .svr import SVR - from .svc import SVC - from .nusvr import NuSVR + +if get_sklearnex_version((2021, "P", 300)): from .nusvc import NuSVC - __all__ = ['SVR', 'SVC', 'NuSVC', 'NuSVR'] + from .nusvr import NuSVR + from .svc import SVC + from .svr import SVR + + __all__ = ["SVR", "SVC", "NuSVC", "NuSVR"] else: from daal4py.sklearn.svm import SVC - __all__ = ['SVC'] + + __all__ = ["SVC"] diff --git a/sklearnex/svm/_common.py b/sklearnex/svm/_common.py index 0220a226f8..36c4d6becf 100644 --- a/sklearnex/svm/_common.py +++ b/sklearnex/svm/_common.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== from abc import ABC -import numpy as np -from daal4py.sklearn._utils import sklearn_check_version, PatchingConditionsChain +import numpy as np +from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import StratifiedKFold from sklearn.preprocessing import LabelEncoder -from sklearn.calibration import CalibratedClassifierCV +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version from onedal.utils import _column_or_1d @@ -31,7 +31,7 @@ def get_dual_coef(self): def set_dual_coef(self, value): self.dual_coef_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.dual_coef_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -43,7 +43,7 @@ def get_intercept(self): def set_intercept(self, value): self._intercept_ = value - if hasattr(self, '_onedal_estimator'): + if hasattr(self, "_onedal_estimator"): self._onedal_estimator.intercept_ = value if not self._is_in_fit: del self._onedal_estimator._onedal_model @@ -51,31 +51,37 @@ def set_intercept(self, value): class BaseSVM(ABC): def _onedal_gpu_supported(self, method_name, *data): - patching_status = PatchingConditionsChain(f'sklearn.{method_name}') - patching_status.and_conditions([ - (False, 'GPU offloading is not supported.') - ]) + patching_status = PatchingConditionsChain(f"sklearn.{method_name}") + patching_status.and_conditions([(False, "GPU offloading is not supported.")]) return patching_status.get_status(logs=True) def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.svm.{class_name}.{method_name}') - if method_name == 'fit': - patching_status.and_conditions([ - (self.kernel in ['linear', 'rbf', 'poly', 'sigmoid'], - f'Kernel is "{self.kernel}" while ' - '"linear", "rbf", "poly" and "sigmoid" are only supported.') - ]) + f"sklearn.svm.{class_name}.{method_name}" + ) + if method_name == "fit": + patching_status.and_conditions( + [ + ( + self.kernel in ["linear", "rbf", "poly", "sigmoid"], + f'Kernel is "{self.kernel}" while ' + '"linear", "rbf", "poly" and "sigmoid" are only supported.', + ) + ] + ) return patching_status.get_status(logs=True) - inference_methods = ['predict'] if class_name.endswith('R') \ - else ['predict', 'predict_proba', 'decision_function'] + inference_methods = ( + ["predict"] + if class_name.endswith("R") + else ["predict", "predict_proba", "decision_function"] + ) if method_name in inference_methods: - patching_status.and_conditions([ - (hasattr(self, '_onedal_estimator'), 'oneDAL model was not trained.') - ]) + patching_status.and_conditions( + [(hasattr(self, "_onedal_estimator"), "oneDAL model was not trained.")] + ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") class BaseSVC(BaseSVM): @@ -92,37 +98,38 @@ def _compute_balanced_class_weight(self, y): return recip_freq[le.transform(classes)] def _fit_proba(self, X, y, sample_weight=None, queue=None): - from .._config import get_config, config_context + from .._config import config_context, get_config params = self.get_params() params["probability"] = False - params["decision_function_shape"] = 'ovr' + params["decision_function_shape"] = 
"ovr" clf_base = self.__class__(**params) # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): try: n_splits = 5 n_jobs = n_splits if queue is None or queue.sycl_device.is_cpu else 1 cv = StratifiedKFold( - n_splits=n_splits, - shuffle=True, - random_state=self.random_state) + n_splits=n_splits, shuffle=True, random_state=self.random_state + ) if sklearn_check_version("0.24"): self.clf_prob = CalibratedClassifierCV( - clf_base, ensemble=False, cv=cv, method='sigmoid', - n_jobs=n_jobs) + clf_base, ensemble=False, cv=cv, method="sigmoid", n_jobs=n_jobs + ) else: self.clf_prob = CalibratedClassifierCV( - clf_base, cv=cv, method='sigmoid') + clf_base, cv=cv, method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) except ValueError: clf_base = clf_base.fit(X, y, sample_weight) self.clf_prob = CalibratedClassifierCV( - clf_base, cv="prefit", method='sigmoid') + clf_base, cv="prefit", method="sigmoid" + ) self.clf_prob.fit(X, y, sample_weight) def _save_attributes(self): @@ -157,7 +164,7 @@ def _save_attributes(self): if sklearn_check_version("1.1"): length = int(len(self.classes_) * (len(self.classes_) - 1) / 2) - self.n_iter_ = np.full((length, ), self._onedal_estimator.n_iter_) + self.n_iter_ = np.full((length,), self._onedal_estimator.n_iter_) class BaseSVR(BaseSVM): diff --git a/sklearnex/svm/nusvc.py b/sklearnex/svm/nusvc.py index cba5aba42e..1eec55763a 100644 --- a/sklearnex/svm/nusvc.py +++ b/sklearnex/svm/nusvc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,16 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== + +from sklearn.exceptions import NotFittedError +from sklearn.svm import NuSVC as sklearn_NuSVC +from sklearn.utils.validation import _deprecate_positional_args from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVC + from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVC -from sklearn.svm import NuSVC as sklearn_NuSVC -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.exceptions import NotFittedError -if sklearn_check_version('1.0'): +if sklearn_check_version("1.0"): from sklearn.utils.metaestimators import available_if from onedal.svm import NuSVC as onedal_NuSVC @@ -30,21 +32,46 @@ class NuSVC(sklearn_NuSVC, BaseSVC): __doc__ = sklearn_NuSVC.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_NuSVC._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): + def __init__( + self, + *, + nu=0.5, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super().__init__( - nu=nu, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, - shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + nu=nu, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) def fit(self, X, y, sample_weight=None): """ @@ -84,10 +111,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NuSVC.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NuSVC.fit, + }, + X, + y, + sample_weight, + ) return self @@ -111,12 +145,18 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_NuSVC.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_NuSVC.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @available_if(sklearn_NuSVC._check_proba) def predict_proba(self, X): """ @@ -146,7 +186,9 @@ def predict_proba(self, X): datasets. 
""" return self._predict_proba(X) + else: + @property def predict_proba(self): self._check_proba() @@ -156,38 +198,50 @@ def predict_proba(self): def _predict_proba(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - sklearn_pred_proba = (sklearn_NuSVC.predict_proba - if sklearn_check_version("1.0") - else sklearn_NuSVC._predict_proba) - - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_pred_proba, - }, X) + sklearn_pred_proba = ( + sklearn_NuSVC.predict_proba + if sklearn_check_version("1.0") + else sklearn_NuSVC._predict_proba + ) + + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_pred_proba, + }, + X, + ) @wrap_output_data def decision_function(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'decision_function', { - 'onedal': self.__class__._onedal_decision_function, - 'sklearn': sklearn_NuSVC.decision_function, - }, X) + return dispatch( + self, + "decision_function", + { + "onedal": self.__class__._onedal_decision_function, + "sklearn": sklearn_NuSVC.decision_function, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'nu': self.nu, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, - 'class_weight': self.class_weight, - 'break_ties': self.break_ties, - 'decision_function_shape': self.decision_function_shape, + "nu": self.nu, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, + "class_weight": self.class_weight, + "break_ties": self.break_ties, + "decision_function_shape": self.decision_function_shape, } self._onedal_estimator = onedal_NuSVC(**onedal_params) @@ -201,15 +255,16 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) def _onedal_predict_proba(self, X, queue=None): - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") - from .._config import get_config, config_context + "predict_proba is not available when fitted with probability=False" + ) + from .._config import config_context, get_config # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): return self.clf_prob.predict_proba(X) diff --git a/sklearnex/svm/nusvr.py b/sklearnex/svm/nusvr.py index 75f14f1e69..837da54beb 100644 --- a/sklearnex/svm/nusvr.py +++ b/sklearnex/svm/nusvr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== - -from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVR -from .._device_offload import dispatch, wrap_output_data +# =============================================================================== from sklearn.svm import NuSVR as sklearn_NuSVR from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import sklearn_check_version from onedal.svm import NuSVR as onedal_NuSVR +from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVR + class NuSVR(sklearn_NuSVR, BaseSVR): __doc__ = sklearn_NuSVR.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_NuSVR._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, tol=1e-3, C=1.0, nu=0.5, shrinking=True, - cache_size=200, verbose=False, max_iter=-1): + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): super().__init__( - kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, nu=nu, - shrinking=shrinking, cache_size=cache_size, verbose=verbose, - max_iter=max_iter) + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + nu=nu, + shrinking=shrinking, + cache_size=cache_size, + verbose=verbose, + max_iter=max_iter, + ) def fit(self, X, y, sample_weight=None): """ @@ -76,10 +98,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_NuSVR.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_NuSVR.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -102,23 +131,28 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_NuSVR.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_NuSVR.predict, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'nu': self.nu, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, + "C": self.C, + "nu": self.nu, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, } self._onedal_estimator = onedal_NuSVR(**onedal_params) diff --git a/sklearnex/svm/svc.py b/sklearnex/svm/svc.py index ff4f1c3466..816502cc20 100644 --- a/sklearnex/svm/svc.py +++ b/sklearnex/svm/svc.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,19 +12,20 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from scipy import sparse as sp +from sklearn.exceptions import NotFittedError +from sklearn.svm import SVC as sklearn_SVC +from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import PatchingConditionsChain, sklearn_check_version -from daal4py.sklearn._utils import sklearn_check_version, PatchingConditionsChain -from ._common import BaseSVC from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVC -from sklearn.svm import SVC as sklearn_SVC -from sklearn.utils.validation import _deprecate_positional_args -from sklearn.exceptions import NotFittedError -if sklearn_check_version('1.0'): +if sklearn_check_version("1.0"): from sklearn.utils.metaestimators import available_if from onedal.svm import SVC as onedal_SVC @@ -33,21 +34,46 @@ class SVC(sklearn_SVC, BaseSVC): __doc__ = sklearn_SVC.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_SVC._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, shrinking=True, probability=False, - tol=1e-3, cache_size=200, class_weight=None, - verbose=False, max_iter=-1, decision_function_shape='ovr', - break_ties=False, random_state=None): + def __init__( + self, + *, + C=1.0, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + shrinking=True, + probability=False, + tol=1e-3, + cache_size=200, + class_weight=None, + verbose=False, + max_iter=-1, + decision_function_shape="ovr", + break_ties=False, + random_state=None, + ): super().__init__( - C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, - shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, - class_weight=class_weight, verbose=verbose, max_iter=max_iter, - decision_function_shape=decision_function_shape, break_ties=break_ties, - random_state=random_state) + C=C, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + probability=probability, + tol=tol, + cache_size=cache_size, + class_weight=class_weight, + verbose=verbose, + max_iter=max_iter, + decision_function_shape=decision_function_shape, + break_ties=break_ties, + random_state=random_state, + ) def fit(self, X, y, sample_weight=None): """ @@ -87,10 +113,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_SVC.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_SVC.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -113,12 +146,18 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_SVC.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_SVC.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if 
sklearn_check_version('1.0'): @available_if(sklearn_SVC._check_proba) def predict_proba(self, X): """ @@ -148,7 +187,9 @@ def predict_proba(self, X): datasets. """ return self._predict_proba(X) + else: + @property def predict_proba(self): self._check_proba() @@ -156,64 +197,88 @@ def predict_proba(self): @wrap_output_data def _predict_proba(self, X): - sklearn_pred_proba = (sklearn_SVC.predict_proba - if sklearn_check_version("1.0") - else sklearn_SVC._predict_proba) - - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_pred_proba, - }, X) + sklearn_pred_proba = ( + sklearn_SVC.predict_proba + if sklearn_check_version("1.0") + else sklearn_SVC._predict_proba + ) + + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_pred_proba, + }, + X, + ) @wrap_output_data def decision_function(self, X): if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'decision_function', { - 'onedal': self.__class__._onedal_decision_function, - 'sklearn': sklearn_SVC.decision_function, - }, X) + return dispatch( + self, + "decision_function", + { + "onedal": self.__class__._onedal_decision_function, + "sklearn": sklearn_SVC.decision_function, + }, + X, + ) def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ patching_status = PatchingConditionsChain( - f'sklearn.svm.{class_name}.{method_name}') - if method_name == 'fit': + f"sklearn.svm.{class_name}.{method_name}" + ) + if method_name == "fit": if len(data) > 1: self._class_count = len(np.unique(data[1])) self._is_sparse = sp.isspmatrix(data[0]) - patching_status.and_conditions([ - (self.kernel in ['linear', 'rbf'], - f'Kernel is "{self.kernel}" while ' - '"linear" and "rbf" are only supported on GPU.'), - (self.class_weight is None, 'Class weight is not supported on GPU.'), - (self._class_count == 2, 'Multiclassification is not supported on GPU.'), - (not self._is_sparse, 'Sparse input is not supported on GPU.') - ]) + patching_status.and_conditions( + [ + ( + self.kernel in ["linear", "rbf"], + f'Kernel is "{self.kernel}" while ' + '"linear" and "rbf" are only supported on GPU.', + ), + (self.class_weight is None, "Class weight is not supported on GPU."), + ( + self._class_count == 2, + "Multiclassification is not supported on GPU.", + ), + (not self._is_sparse, "Sparse input is not supported on GPU."), + ] + ) return patching_status.get_status(logs=True) - if method_name in ['predict', 'predict_proba', 'decision_function']: - patching_status.and_conditions([ - (hasattr(self, '_onedal_estimator') and self._onedal_gpu_supported( - 'fit', *data), - 'oneDAL model was not trained on GPU.') - ]) + if method_name in ["predict", "predict_proba", "decision_function"]: + patching_status.and_conditions( + [ + ( + hasattr(self, "_onedal_estimator") + and self._onedal_gpu_supported("fit", *data), + "oneDAL model was not trained on GPU.", + ) + ] + ) return patching_status.get_status(logs=True) - raise RuntimeError(f'Unknown method {method_name} in {class_name}') + raise RuntimeError(f"Unknown method {method_name} in {class_name}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, - 'class_weight': self.class_weight, - 
'break_ties': self.break_ties, - 'decision_function_shape': self.decision_function_shape, + "C": self.C, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, + "class_weight": self.class_weight, + "break_ties": self.break_ties, + "decision_function_shape": self.decision_function_shape, } self._onedal_estimator = onedal_SVC(**onedal_params) @@ -227,15 +292,16 @@ def _onedal_predict(self, X, queue=None): return self._onedal_estimator.predict(X, queue=queue) def _onedal_predict_proba(self, X, queue=None): - if getattr(self, 'clf_prob', None) is None: + if getattr(self, "clf_prob", None) is None: raise NotFittedError( - "predict_proba is not available when fitted with probability=False") - from .._config import get_config, config_context + "predict_proba is not available when fitted with probability=False" + ) + from .._config import config_context, get_config # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue with config_context(**cfg): return self.clf_prob.predict_proba(X) diff --git a/sklearnex/svm/svr.py b/sklearnex/svm/svr.py index c47bc3334c..b841a1a512 100644 --- a/sklearnex/svm/svr.py +++ b/sklearnex/svm/svr.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,53 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== - -from daal4py.sklearn._utils import sklearn_check_version -from ._common import BaseSVR -from .._device_offload import dispatch, wrap_output_data +# =============================================================================== from sklearn.svm import SVR as sklearn_SVR from sklearn.utils.validation import _deprecate_positional_args + +from daal4py.sklearn._utils import sklearn_check_version from onedal.svm import SVR as onedal_SVR +from .._device_offload import dispatch, wrap_output_data +from ._common import BaseSVR + class SVR(sklearn_SVR, BaseSVR): __doc__ = sklearn_SVR.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = {**sklearn_SVR._parameter_constraints} @_deprecate_positional_args - def __init__(self, *, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, - cache_size=200, verbose=False, max_iter=-1): + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + C=1.0, + epsilon=0.1, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): super().__init__( - kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=C, - epsilon=epsilon, shrinking=shrinking, cache_size=cache_size, verbose=verbose, - max_iter=max_iter) + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=C, + epsilon=epsilon, + shrinking=shrinking, + cache_size=cache_size, + verbose=verbose, + max_iter=max_iter, + ) def fit(self, X, y, sample_weight=None): """ @@ -76,10 +98,17 @@ def fit(self, X, y, sample_weight=None): self._validate_params() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_SVR.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_SVR.fit, + }, + X, + y, + sample_weight, + ) return self @@ -103,23 +132,28 @@ def predict(self, X): """ if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_SVR.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_SVR.predict, + }, + X, + ) def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params = { - 'C': self.C, - 'epsilon': self.epsilon, - 'kernel': self.kernel, - 'degree': self.degree, - 'gamma': self.gamma, - 'coef0': self.coef0, - 'tol': self.tol, - 'shrinking': self.shrinking, - 'cache_size': self.cache_size, - 'max_iter': self.max_iter, + "C": self.C, + "epsilon": self.epsilon, + "kernel": self.kernel, + "degree": self.degree, + "gamma": self.gamma, + "coef0": self.coef0, + "tol": self.tol, + "shrinking": self.shrinking, + "cache_size": self.cache_size, + "max_iter": self.max_iter, } self._onedal_estimator = onedal_SVR(**onedal_params) diff --git a/sklearnex/svm/tests/test_svm.py b/sklearnex/svm/tests/test_svm.py index 9ce49fc67d..6fcfb3718c 100755 --- a/sklearnex/svm/tests/test_svm.py +++ b/sklearnex/svm/tests/test_svm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, 
Version 2.0 (the "License"); @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np from numpy.testing import assert_allclose @@ -21,43 +21,43 @@ def test_sklearnex_import_svc(): from sklearnex.svm import SVC - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = SVC(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ - assert_allclose(svc.dual_coef_, [[-0.25, .25]]) + svc = SVC(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ + assert_allclose(svc.dual_coef_, [[-0.25, 0.25]]) assert_allclose(svc.support_, [1, 3]) def test_sklearnex_import_nusvc(): from sklearnex.svm import NuSVC - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = NuSVC(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ + svc = NuSVC(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ assert_allclose(svc.dual_coef_, [[-0.04761905, -0.0952381, 0.0952381, 0.04761905]]) assert_allclose(svc.support_, [0, 1, 3, 4]) def test_sklearnex_import_svr(): from sklearnex.svm import SVR - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = SVR(kernel='linear').fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ + svc = SVR(kernel="linear").fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ assert_allclose(svc.dual_coef_, [[-0.1, 0.1]]) assert_allclose(svc.support_, [1, 3]) def test_sklearnex_import_nusvr(): from sklearnex.svm import NuSVR - X = np.array([[-2, -1], [-1, -1], [-1, -2], - [+1, +1], [+1, +2], [+2, +1]]) + + X = np.array([[-2, -1], [-1, -1], [-1, -2], [+1, +1], [+1, +2], [+2, +1]]) y = np.array([1, 1, 1, 2, 2, 2]) - svc = NuSVR(kernel='linear', nu=0.9).fit(X, y) - assert 'daal4py' in svc.__module__ or 'sklearnex' in svc.__module__ - assert_allclose(svc.dual_coef_, [[-1., 0.611111, 1., -0.611111]], rtol=1e-3) + svc = NuSVR(kernel="linear", nu=0.9).fit(X, y) + assert "daal4py" in svc.__module__ or "sklearnex" in svc.__module__ + assert_allclose(svc.dual_coef_, [[-1.0, 0.611111, 1.0, -0.611111]], rtol=1e-3) assert_allclose(svc.support_, [1, 2, 3, 5]) diff --git a/sklearnex/tests/_models_info.py b/sklearnex/tests/_models_info.py index 309feb78a6..afe213d569 100755 --- a/sklearnex/tests/_models_info.py +++ b/sklearnex/tests/_models_info.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,115 +12,126 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import numpy as np - -from sklearn.svm import SVC -from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor) -from sklearn.neighbors import ( - KNeighborsClassifier, - KNeighborsRegressor, - NearestNeighbors, - LocalOutlierFactor) +from sklearn.cluster import DBSCAN, KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, LogisticRegression, LogisticRegressionCV, - LinearRegression, Ridge, - ElasticNet, - Lasso) -from sklearn.cluster import (KMeans, DBSCAN) +) from sklearn.manifold import TSNE -from sklearn.decomposition import PCA +from sklearn.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + LocalOutlierFactor, + NearestNeighbors, +) +from sklearn.svm import SVC MODELS_INFO = [ { - 'model': KNeighborsClassifier(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": KNeighborsClassifier(algorithm="brute"), + "methods": ["kneighbors", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(algorithm='brute'), - 'methods': ['kneighbors', 'predict', 'score'], - 'dataset': 'regression', + "model": KNeighborsRegressor(algorithm="brute"), + "methods": ["kneighbors", "predict", "score"], + "dataset": "regression", }, { - 'model': NearestNeighbors(algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(novelty=False), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(novelty=False), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(novelty=True), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(novelty=True), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': DBSCAN(), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": DBSCAN(), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': SVC(probability=True), - 'methods': ['decision_function', 'predict', 'predict_proba', 'score'], - 'dataset': 'classifier', + "model": SVC(probability=True), + "methods": ["decision_function", "predict", "predict_proba", "score"], + "dataset": "classifier", }, { - 'model': KMeans(), - 'methods': ['fit_predict', 'fit_transform', 'transform', 'predict', 'score'], - 'dataset': 'blobs', + "model": KMeans(), + "methods": ["fit_predict", "fit_transform", "transform", "predict", "score"], + "dataset": "blobs", }, { - 'model': ElasticNet(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": ElasticNet(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Lasso(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Lasso(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': PCA(), - 'methods': ['fit_transform', 'transform', 'score'], - 'dataset': 'classifier', + "model": PCA(), + "methods": ["fit_transform", "transform", "score"], + "dataset": "classifier", }, { - 'model': 
LogisticRegression(max_iter=100, multi_class='multinomial'), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegression(max_iter=100, multi_class="multinomial"), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(max_iter=100), - 'methods': ['decision_function', 'predict', 'predict_proba', - 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": LogisticRegressionCV(max_iter=100), + "methods": [ + "decision_function", + "predict", + "predict_proba", + "predict_log_proba", + "score", + ], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(n_estimators=10), - 'methods': ['predict', 'predict_proba', 'predict_log_proba', 'score'], - 'dataset': 'classifier', + "model": RandomForestClassifier(n_estimators=10), + "methods": ["predict", "predict_proba", "predict_log_proba", "score"], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(n_estimators=10), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": RandomForestRegressor(n_estimators=10), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict", "score"], + "dataset": "regression", }, { - 'model': Ridge(), - 'methods': ['predict', 'score'], - 'dataset': 'regression', + "model": Ridge(), + "methods": ["predict", "score"], + "dataset": "regression", }, ] @@ -140,20 +151,20 @@ TO_SKIP = [ # --------------- NO INFO --------------- - r'KMeans .*transform', - r'KMeans .*score', - r'PCA .*score', - r'LogisticRegression .*decision_function', - r'LogisticRegressionCV .*decision_function', - r'LogisticRegressionCV .*predict', - r'LogisticRegressionCV .*predict_proba', - r'LogisticRegressionCV .*predict_log_proba', - r'LogisticRegressionCV .*score', + r"KMeans .*transform", + r"KMeans .*score", + r"PCA .*score", + r"LogisticRegression .*decision_function", + r"LogisticRegressionCV .*decision_function", + r"LogisticRegressionCV .*predict", + r"LogisticRegressionCV .*predict_proba", + r"LogisticRegressionCV .*predict_log_proba", + r"LogisticRegressionCV .*score", # --------------- Scikit --------------- - r'Ridge float16 predict', - r'Ridge float16 score', - r'RandomForestClassifier .*predict_proba', - r'RandomForestClassifier .*predict_log_proba', - r'pairwise_distances .*pairwise_distances', # except float64 - r'roc_auc_score .*roc_auc_score' + r"Ridge float16 predict", + r"Ridge float16 score", + r"RandomForestClassifier .*predict_proba", + r"RandomForestClassifier .*predict_log_proba", + r"pairwise_distances .*pairwise_distances", # except float64 + r"roc_auc_score .*roc_auc_score", ] diff --git a/sklearnex/tests/test_config.py b/sklearnex/tests/test_config.py index 0a18b00dd3..36f5e82b0a 100644 --- a/sklearnex/tests/test_config.py +++ b/sklearnex/tests/test_config.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sklearn + import sklearnex @@ -27,12 +28,12 @@ def test_get_config_contains_sklearn_params(): def test_set_config_works(): default_config = sklearnex.get_config() - sklearnex.set_config(assume_finite=True, - target_offload='cpu:0', - allow_fallback_to_host=True) + sklearnex.set_config( + assume_finite=True, target_offload="cpu:0", allow_fallback_to_host=True + ) config = sklearnex.get_config() - assert config['target_offload'] == 'cpu:0' - assert config['allow_fallback_to_host'] - assert config['assume_finite'] + assert config["target_offload"] == "cpu:0" + assert config["allow_fallback_to_host"] + assert config["assume_finite"] sklearnex.set_config(**default_config) diff --git a/sklearnex/tests/test_memory_usage.py b/sklearnex/tests/test_memory_usage.py index fa02df6f5b..bd7d87bd51 100644 --- a/sklearnex/tests/test_memory_usage.py +++ b/sklearnex/tests/test_memory_usage.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import pytest -import types +import gc +import logging import tracemalloc +import types + +import numpy as np +import pandas as pd +import pytest +from scipy.stats import pearsonr +from sklearn.base import BaseEstimator +from sklearn.datasets import make_classification +from sklearn.model_selection import KFold + from sklearnex import get_patch_map -from sklearnex.model_selection import train_test_split -from sklearnex.utils import _assert_all_finite from sklearnex.metrics import pairwise_distances, roc_auc_score +from sklearnex.model_selection import train_test_split from sklearnex.preview.decomposition import PCA as PreviewPCA -from sklearnex.preview.linear_model import LinearRegression as PreviewLinearRegression +from sklearnex.preview.ensemble import ExtraTreesClassifier as PreviewExtraTreesClassifier +from sklearnex.preview.ensemble import ExtraTreesRegressor as PreviewExtraTreesRegressor from sklearnex.preview.ensemble import ( RandomForestClassifier as PreviewRandomForestClassifier, +) +from sklearnex.preview.ensemble import ( RandomForestRegressor as PreviewRandomForestRegressor, - ExtraTreesClassifier as PreviewExtraTreesClassifier, - ExtraTreesRegressor as PreviewExtraTreesRegressor ) -from sklearn.base import BaseEstimator -from sklearn.model_selection import KFold -from sklearn.datasets import make_classification -from scipy.stats import pearsonr -import pandas as pd -import numpy as np -import gc -import logging +from sklearnex.preview.linear_model import LinearRegression as PreviewLinearRegression +from sklearnex.utils import _assert_all_finite class TrainTestSplitEstimator: @@ -63,12 +67,12 @@ def fit(self, x, y): class CosineDistancesEstimator(PairwiseDistancesEstimator): def __init__(self): - 
self.metric = 'cosine' + self.metric = "cosine" class CorrelationDistancesEstimator(PairwiseDistancesEstimator): def __init__(self): - self.metric = 'correlation' + self.metric = "correlation" class RocAucEstimator: @@ -87,21 +91,21 @@ def get_patched_estimators(ban_list, output_list): if not isinstance(estimator, types.FunctionType): if name not in ban_list: if isinstance(estimator(), BaseEstimator): - if hasattr(estimator, 'fit'): + if hasattr(estimator, "fit"): output_list.append(estimator) def remove_duplicated_estimators(estimators_list): estimators_map = {} for estimator in estimators_list: - full_name = f'{estimator.__module__}.{estimator.__name__}' + full_name = f"{estimator.__module__}.{estimator.__name__}" estimators_map[full_name] = estimator return estimators_map.values() BANNED_ESTIMATORS = ( - 'LocalOutlierFactor', # fails on ndarray_c for sklearn > 1.0 - 'TSNE', # too slow for using in testing on common data size + "LocalOutlierFactor", # fails on ndarray_c for sklearn > 1.0 + "TSNE", # too slow for using in testing on common data size ) estimators = [ PreviewPCA, @@ -114,7 +118,7 @@ def remove_duplicated_estimators(estimators_list): FiniteCheckEstimator, CosineDistancesEstimator, CorrelationDistancesEstimator, - RocAucEstimator + RocAucEstimator, ] get_patched_estimators(BANNED_ESTIMATORS, estimators) estimators = remove_duplicated_estimators(estimators) @@ -136,17 +140,9 @@ def dataframe_f(x, y): return pd.DataFrame(np.asfortranarray(x)), pd.Series(y) -data_transforms = [ - ndarray_c, - ndarray_f, - dataframe_c, - dataframe_f -] +data_transforms = [ndarray_c, ndarray_f, dataframe_c, dataframe_f] -data_shapes = [ - (1000, 100), - (2000, 50) -] +data_shapes = [(1000, 100), (2000, 50)] EXTRA_MEMORY_THRESHOLD = 0.15 N_SPLITS = 10 @@ -154,9 +150,13 @@ def dataframe_f(x, y): def gen_clsf_data(n_samples, n_features): data, label = make_classification( - n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777) - return data, label, \ - data.size * data.dtype.itemsize + label.size * label.dtype.itemsize + n_classes=2, n_samples=n_samples, n_features=n_features, random_state=777 + ) + return ( + data, + label, + data.size * data.dtype.itemsize + label.size * label.dtype.itemsize, + ) def split_train_inference(kf, x, y, estimator): @@ -172,11 +172,11 @@ def split_train_inference(kf, x, y, estimator): # fallback to stock scikit-learn with default parameters alg = estimator() alg.fit(x_train, y_train) - if hasattr(alg, 'predict'): + if hasattr(alg, "predict"): alg.predict(x_test) - elif hasattr(alg, 'transform'): + elif hasattr(alg, "transform"): alg.transform(x_test) - elif hasattr(alg, 'kneighbors'): + elif hasattr(alg, "kneighbors"): alg.kneighbors(x_test) del alg, x_train, x_test, y_train, y_test mem_tracks.append(tracemalloc.get_traced_memory()[0]) @@ -194,38 +194,46 @@ def _kfold_function_template(estimator, data_transform_function, data_shape): mem_before, _ = tracemalloc.get_traced_memory() mem_tracks = split_train_inference(kf, x, y, estimator) - mem_iter_diffs = (np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1])) + mem_iter_diffs = np.array(mem_tracks[1:]) - np.array(mem_tracks[:-1]) mem_incr_mean, mem_incr_std = mem_iter_diffs.mean(), mem_iter_diffs.std() mem_incr_mean, mem_incr_std = round(mem_incr_mean), round(mem_incr_std) mem_iter_corr, _ = pearsonr(mem_tracks, list(range(len(mem_tracks)))) if mem_iter_corr > 0.95: - logging.warning('Memory usage is steadily increasing with iterations ' - '(Pearson correlation coefficient between ' - f'memory tracks 
and iterations is {mem_iter_corr})\n' - 'Memory usage increase per iteration: ' - f'{mem_incr_mean}±{mem_incr_std} bytes') + logging.warning( + "Memory usage is steadily increasing with iterations " + "(Pearson correlation coefficient between " + f"memory tracks and iterations is {mem_iter_corr})\n" + "Memory usage increase per iteration: " + f"{mem_incr_mean}±{mem_incr_std} bytes" + ) mem_before_gc, _ = tracemalloc.get_traced_memory() mem_diff = mem_before_gc - mem_before - message = 'Size of extra allocated memory {} using garbage collector ' \ - f'is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data' \ - f'\n\tAlgorithm: {estimator.__name__}' \ - f'\n\tInput data size: {data_memory_size} bytes' \ - '\n\tExtra allocated memory size: {} bytes' \ - ' / {} %' + message = ( + "Size of extra allocated memory {} using garbage collector " + f"is greater than {EXTRA_MEMORY_THRESHOLD * 100}% of input data" + f"\n\tAlgorithm: {estimator.__name__}" + f"\n\tInput data size: {data_memory_size} bytes" + "\n\tExtra allocated memory size: {} bytes" + " / {} %" + ) if mem_diff >= EXTRA_MEMORY_THRESHOLD * data_memory_size: - logging.warning(message.format( - 'before', mem_diff, round((mem_diff) / data_memory_size * 100, 2))) + logging.warning( + message.format( + "before", mem_diff, round((mem_diff) / data_memory_size * 100, 2) + ) + ) gc.collect() mem_after, _ = tracemalloc.get_traced_memory() tracemalloc.stop() mem_diff = mem_after - mem_before - assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, \ - message.format('after', mem_diff, round((mem_diff) / data_memory_size * 100, 2)) + assert mem_diff < EXTRA_MEMORY_THRESHOLD * data_memory_size, message.format( + "after", mem_diff, round((mem_diff) / data_memory_size * 100, 2) + ) -@pytest.mark.parametrize('data_transform_function', data_transforms) -@pytest.mark.parametrize('estimator', estimators) -@pytest.mark.parametrize('data_shape', data_shapes) +@pytest.mark.parametrize("data_transform_function", data_transforms) +@pytest.mark.parametrize("estimator", estimators) +@pytest.mark.parametrize("data_shape", data_shapes) def test_memory_leaks(estimator, data_transform_function, data_shape): _kfold_function_template(estimator, data_transform_function, data_shape) diff --git a/sklearnex/tests/test_monkeypatch.py b/sklearnex/tests/test_monkeypatch.py index 96de29f698..bcb91f0f99 100755 --- a/sklearnex/tests/test_monkeypatch.py +++ b/sklearnex/tests/test_monkeypatch.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== import sklearnex from daal4py.sklearn._utils import daal_check_version @@ -35,9 +35,9 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert \ - class_module.startswith('daal4py') or class_module.startswith('sklearnex'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py") or class_module.startswith( + "sklearnex" + ), "Patching has completed with error." 
for i, _ in enumerate(_tokens): t = _tokens[i] @@ -46,8 +46,7 @@ def test_monkey_patching(): sklearnex.unpatch_sklearn(t) class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." sklearnex.unpatch_sklearn() @@ -57,8 +56,7 @@ def test_monkey_patching(): n = _classes[i][1] class_module = getattr(p, n).__module__ - assert class_module.startswith('sklearn'), \ - "Unpatching has completed with error." + assert class_module.startswith("sklearn"), "Unpatching has completed with error." sklearnex.unpatch_sklearn() @@ -70,9 +68,9 @@ def test_monkey_patching(): sklearnex.patch_sklearn(t) class_module = getattr(p, n).__module__ - assert \ - class_module.startswith('daal4py') or class_module.startswith('sklearnex'), \ - "Patching has completed with error." + assert class_module.startswith("daal4py") or class_module.startswith( + "sklearnex" + ), "Patching has completed with error." sklearnex.unpatch_sklearn() @@ -81,14 +79,14 @@ def test_patch_by_list_simple(): sklearnex.patch_sklearn(["LogisticRegression"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('sklearn') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("sklearn") sklearnex.unpatch_sklearn() @@ -97,14 +95,14 @@ def test_patch_by_list_many_estimators(): sklearnex.patch_sklearn(["LogisticRegression", "SVC"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") sklearnex.unpatch_sklearn() @@ -113,31 +111,32 @@ def test_unpatch_by_list_many_estimators(): sklearnex.patch_sklearn() from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('daal4py') - assert KNeighborsRegressor.__module__.startswith('daal4py') or \ - KNeighborsRegressor.__module__.startswith('sklearnex') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("daal4py") 
+ assert KNeighborsRegressor.__module__.startswith( + "daal4py" + ) or KNeighborsRegressor.__module__.startswith("sklearnex") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") sklearnex.unpatch_sklearn(["KNeighborsRegressor", "RandomForestRegressor"]) from sklearn.ensemble import RandomForestRegressor - from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression + from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVC - assert RandomForestRegressor.__module__.startswith('sklearn') - assert KNeighborsRegressor.__module__.startswith('sklearn') - assert LogisticRegression.__module__.startswith('daal4py') - assert SVC.__module__.startswith('daal4py') or SVC.__module__.startswith('sklearnex') + assert RandomForestRegressor.__module__.startswith("sklearn") + assert KNeighborsRegressor.__module__.startswith("sklearn") + assert LogisticRegression.__module__.startswith("daal4py") + assert SVC.__module__.startswith("daal4py") or SVC.__module__.startswith("sklearnex") def test_patching_checker(): - for name in [None, 'SVC', 'PCA']: + for name in [None, "SVC", "PCA"]: sklearnex.patch_sklearn(name=name) assert sklearnex.sklearn_is_patched(name=name) @@ -159,10 +158,10 @@ def test_patching_checker(): def test_preview_namespace(): def get_estimators(): + from sklearn.cluster import DBSCAN + from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression - from sklearn.decomposition import PCA - from sklearn.cluster import DBSCAN from sklearn.svm import SVC return LinearRegression(), PCA(), DBSCAN(), SVC(), RandomForestClassifier() @@ -175,34 +174,34 @@ def get_estimators(): assert sklearnex.dispatcher._is_preview_enabled() lr, pca, dbscan, svc, rfc = get_estimators() - assert 'sklearnex.preview' in rfc.__module__ + assert "sklearnex.preview" in rfc.__module__ - if daal_check_version((2023, 'P', 100)): - assert 'sklearnex.preview' in lr.__module__ + if daal_check_version((2023, "P", 100)): + assert "sklearnex.preview" in lr.__module__ else: - assert 'daal4py' in lr.__module__ + assert "daal4py" in lr.__module__ - assert 'sklearnex.preview' in pca.__module__ - assert 'daal4py' in dbscan.__module__ - assert 'sklearnex' in svc.__module__ + assert "sklearnex.preview" in pca.__module__ + assert "daal4py" in dbscan.__module__ + assert "sklearnex" in svc.__module__ sklearnex.unpatch_sklearn() # no patching behavior lr, pca, dbscan, svc, rfc = get_estimators() - assert 'sklearn.' in lr.__module__ - assert 'sklearn.' in pca.__module__ - assert 'sklearn.' in dbscan.__module__ - assert 'sklearn.' in svc.__module__ - assert 'sklearn.' in rfc.__module__ + assert "sklearn." in lr.__module__ + assert "sklearn." in pca.__module__ + assert "sklearn." in dbscan.__module__ + assert "sklearn." in svc.__module__ + assert "sklearn." 
in rfc.__module__ # default patching behavior sklearnex.patch_sklearn() assert not sklearnex.dispatcher._is_preview_enabled() lr, pca, dbscan, svc, rfc = get_estimators() - assert 'daal4py' in lr.__module__ - assert 'daal4py' in pca.__module__ - assert 'daal4py' in rfc.__module__ - assert 'daal4py' in dbscan.__module__ - assert 'sklearnex' in svc.__module__ + assert "daal4py" in lr.__module__ + assert "daal4py" in pca.__module__ + assert "daal4py" in rfc.__module__ + assert "daal4py" in dbscan.__module__ + assert "sklearnex" in svc.__module__ sklearnex.unpatch_sklearn() diff --git a/sklearnex/tests/test_patching.py b/sklearnex/tests/test_patching.py index c739f3b6d1..cd0c5d2fca 100755 --- a/sklearnex/tests/test_patching.py +++ b/sklearnex/tests/test_patching.py @@ -99,8 +99,11 @@ def _load_all_models(patched): models = [] for patch_infos in get_patch_map().values(): maybe_class = getattr(patch_infos[0][0][0], patch_infos[0][0][1]) - if maybe_class is not None and isclass(maybe_class) and \ - issubclass(maybe_class, BaseEstimator): + if ( + maybe_class is not None + and isclass(maybe_class) + and issubclass(maybe_class, BaseEstimator) + ): models.append(maybe_class()) if patched: @@ -113,9 +116,7 @@ def _load_all_models(patched): UNPATCHED_MODELS = _load_all_models(patched=False) -@pytest.mark.parametrize( - ("patched", "unpatched"), zip(PATCHED_MODELS, UNPATCHED_MODELS) -) +@pytest.mark.parametrize(("patched", "unpatched"), zip(PATCHED_MODELS, UNPATCHED_MODELS)) def test_is_patched_instance(patched, unpatched): assert is_patched_instance(patched), f"{patched} is a patched instance" assert not is_patched_instance(unpatched), f"{unpatched} is an unpatched instance" diff --git a/sklearnex/tests/test_run_to_run_stability_tests.py b/sklearnex/tests/test_run_to_run_stability_tests.py index 91d84ec283..33f39bea79 100755 --- a/sklearnex/tests/test_run_to_run_stability_tests.py +++ b/sklearnex/tests/test_run_to_run_stability_tests.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,31 +12,48 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#=============================================================================== +# =============================================================================== + +import random -import daal4py as d4p import numpy as np import pytest -import random +import daal4py as d4p from sklearnex import patch_sklearn + patch_sklearn() -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegression, LogisticRegressionCV -from sklearn.neighbors import (KNeighborsClassifier, KNeighborsRegressor, - NearestNeighbors, LocalOutlierFactor) -from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso -from sklearn.cluster import KMeans, DBSCAN +from scipy import sparse +from sklearn.cluster import DBSCAN, KMeans +from sklearn.datasets import ( + load_breast_cancer, + load_diabetes, + load_iris, + make_classification, + make_regression, +) from sklearn.decomposition import PCA -from sklearn.svm import SVC, NuSVC, SVR, NuSVR +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import ( + ElasticNet, + Lasso, + LinearRegression, + LogisticRegression, + LogisticRegressionCV, + Ridge, +) from sklearn.manifold import TSNE +from sklearn.metrics import pairwise_distances, roc_auc_score from sklearn.model_selection import train_test_split +from sklearn.neighbors import ( + KNeighborsClassifier, + KNeighborsRegressor, + LocalOutlierFactor, + NearestNeighbors, +) +from sklearn.svm import SVC, SVR, NuSVC, NuSVR -from sklearn.datasets import (make_classification, make_regression, - load_breast_cancer, load_diabetes, load_iris) -from sklearn.metrics import pairwise_distances, roc_auc_score -from scipy import sparse from daal4py.sklearn._utils import daal_check_version # to reproduce errors even in CI @@ -51,40 +68,40 @@ def method_processing(X, clf, methods): res = [] name = [] for i in methods: - if i == 'predict': + if i == "predict": res.append(clf.predict(X)) - name.append(get_class_name(clf) + '.predict(X)') - elif i == 'predict_proba': + name.append(get_class_name(clf) + ".predict(X)") + elif i == "predict_proba": res.append(clf.predict_proba(X)) - name.append(get_class_name(clf) + '.predict_proba(X)') - elif i == 'decision_function': + name.append(get_class_name(clf) + ".predict_proba(X)") + elif i == "decision_function": res.append(clf.decision_function(X)) - name.append(get_class_name(clf) + '.decision_function(X)') - elif i == 'kneighbors': + name.append(get_class_name(clf) + ".decision_function(X)") + elif i == "kneighbors": dist, idx = clf.kneighbors(X) res.append(dist) - name.append('dist') + name.append("dist") res.append(idx) - name.append('idx') - elif i == 'fit_predict': + name.append("idx") + elif i == "fit_predict": predict = clf.fit_predict(X) res.append(predict) - name.append(get_class_name(clf) + '.fit_predict') - elif i == 'fit_transform': + name.append(get_class_name(clf) + ".fit_predict") + elif i == "fit_transform": res.append(clf.fit_transform(X)) - name.append(get_class_name(clf) + '.fit_transform') - elif i == 'transform': + name.append(get_class_name(clf) + ".fit_transform") + elif i == "transform": res.append(clf.transform(X)) - name.append(get_class_name(clf) + '.transform(X)') - elif i == 'get_covariance': + name.append(get_class_name(clf) + ".transform(X)") + elif i == "get_covariance": res.append(clf.get_covariance()) - name.append(get_class_name(clf) + '.get_covariance()') - elif i == 'get_precision': + name.append(get_class_name(clf) + 
".get_covariance()") + elif i == "get_precision": res.append(clf.get_precision()) - name.append(get_class_name(clf) + '.get_precision()') - elif i == 'score_samples': + name.append(get_class_name(clf) + ".get_precision()") + elif i == "score_samples": res.append(clf.score_samples(X)) - name.append(get_class_name(clf) + '.score_samples(X)') + name.append(get_class_name(clf) + ".score_samples(X)") return res, name @@ -98,29 +115,30 @@ def func(X, Y, clf, methods): if isinstance(ans, np.ndarray) and None in ans: continue res.append(ans) - name.append(get_class_name(clf) + '.' + i) + name.append(get_class_name(clf) + "." + i) return res, name def _run_test(model, methods, dataset): datasets = [] - if dataset in ['blobs', 'classifier', 'sparse']: + if dataset in ["blobs", "classifier", "sparse"]: X1, y1 = load_iris(return_X_y=True) - if dataset == 'sparse': + if dataset == "sparse": X1 = sparse.csr_matrix(X1) datasets.append((X1, y1)) X2, y2 = load_breast_cancer(return_X_y=True) - if dataset == 'sparse': + if dataset == "sparse": X2 = sparse.csr_matrix(X2) datasets.append((X2, y2)) - elif dataset == 'regression': - X1, y1 = make_regression(n_samples=500, n_features=10, - noise=64.0, random_state=42) + elif dataset == "regression": + X1, y1 = make_regression( + n_samples=500, n_features=10, noise=64.0, random_state=42 + ) datasets.append((X1, y1)) X2, y2 = load_diabetes(return_X_y=True) datasets.append((X2, y2)) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") for X, y in datasets: baseline, name = func(X, y, model, methods) @@ -128,239 +146,264 @@ def _run_test(model, methods, dataset): res, _ = func(X, y, model, methods) for a, b, n in zip(res, baseline, name): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str(n + " is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str(n + " is incorrect") + ) MODELS_INFO = [ { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='brute', - weights="uniform"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="brute", weights="uniform" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='brute', - weights="distance"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="brute", weights="distance" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree', - weights="uniform"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="kd_tree", weights="uniform" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree', - weights="distance"), - 'methods': ['predict', 'predict_proba', 'kneighbors'], - 'dataset': 'classifier', + "model": KNeighborsClassifier( + n_neighbors=10, algorithm="kd_tree", weights="distance" + ), + "methods": ["predict", "predict_proba", "kneighbors"], + "dataset": "classifier", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='kd_tree', - weights="distance"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( 
+ n_neighbors=10, algorithm="kd_tree", weights="distance" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='kd_tree', - weights="uniform"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="kd_tree", weights="uniform" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='brute', - weights="distance"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="brute", weights="distance" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': KNeighborsRegressor(n_neighbors=10, algorithm='brute', - weights="uniform"), - 'methods': ['predict', 'kneighbors'], - 'dataset': 'regression', + "model": KNeighborsRegressor( + n_neighbors=10, algorithm="brute", weights="uniform" + ), + "methods": ["predict", "kneighbors"], + "dataset": "regression", }, { - 'model': NearestNeighbors(n_neighbors=10, algorithm='brute'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(n_neighbors=10, algorithm="brute"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': NearestNeighbors(n_neighbors=10, algorithm='kd_tree'), - 'methods': ['kneighbors'], - 'dataset': 'blobs', + "model": NearestNeighbors(n_neighbors=10, algorithm="kd_tree"), + "methods": ["kneighbors"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(n_neighbors=10, novelty=False), - 'methods': ['fit_predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(n_neighbors=10, novelty=False), + "methods": ["fit_predict"], + "dataset": "blobs", }, { - 'model': LocalOutlierFactor(n_neighbors=10, novelty=True), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": LocalOutlierFactor(n_neighbors=10, novelty=True), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': DBSCAN(algorithm="brute", n_jobs=-1), - 'methods': [], - 'dataset': 'blobs', + "model": DBSCAN(algorithm="brute", n_jobs=-1), + "methods": [], + "dataset": "blobs", }, { - 'model': SVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'classifier', + "model": SVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "classifier", }, { - 'model': SVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'sparse', + "model": SVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "sparse", }, { - 'model': NuSVC(kernel='rbf'), - 'methods': ['predict', 'decision_function'], - 'dataset': 'classifier', + "model": NuSVC(kernel="rbf"), + "methods": ["predict", "decision_function"], + "dataset": "classifier", }, { - 'model': SVR(kernel='rbf'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": SVR(kernel="rbf"), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': NuSVR(kernel='rbf'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": NuSVR(kernel="rbf"), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': TSNE(random_state=0), - 'methods': ['fit_transform'], - 'dataset': 'classifier', + "model": TSNE(random_state=0), + "methods": ["fit_transform"], + "dataset": "classifier", }, { - 'model': KMeans(random_state=0, init="k-means++"), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": KMeans(random_state=0, init="k-means++"), + "methods": 
["predict"], + "dataset": "blobs", }, { - 'model': KMeans(random_state=0, init="random"), - 'methods': ['predict'], - 'dataset': 'blobs', + "model": KMeans(random_state=0, init="random"), + "methods": ["predict"], + "dataset": "blobs", }, { - 'model': KMeans(random_state=0, init="k-means++"), - 'methods': ['predict'], - 'dataset': 'sparse', + "model": KMeans(random_state=0, init="k-means++"), + "methods": ["predict"], + "dataset": "sparse", }, { - 'model': KMeans(random_state=0, init="random"), - 'methods': ['predict'], - 'dataset': 'sparse', + "model": KMeans(random_state=0, init="random"), + "methods": ["predict"], + "dataset": "sparse", }, { - 'model': ElasticNet(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": ElasticNet(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': Lasso(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": Lasso(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': PCA(n_components=0.5, svd_solver="full", random_state=0), - 'methods': ['transform', 'get_covariance', 'get_precision', 'score_samples'], - 'dataset': 'classifier', + "model": PCA(n_components=0.5, svd_solver="full", random_state=0), + "methods": ["transform", "get_covariance", "get_precision", "score_samples"], + "dataset": "classifier", }, { - 'model': RandomForestClassifier(random_state=0, oob_score=True, - max_samples=0.5, max_features='sqrt'), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": RandomForestClassifier( + random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegression(random_state=0, solver="newton-cg", max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegression(random_state=0, solver="newton-cg", max_iter=1000), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegression(random_state=0, solver="lbfgs", max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegression(random_state=0, solver="lbfgs", max_iter=1000), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(random_state=0, solver="newton-cg", - n_jobs=-1, max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegressionCV( + random_state=0, solver="newton-cg", n_jobs=-1, max_iter=1000 + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': LogisticRegressionCV(random_state=0, solver="lbfgs", - n_jobs=-1, max_iter=1000), - 'methods': ['predict', 'predict_proba'], - 'dataset': 'classifier', + "model": LogisticRegressionCV( + random_state=0, solver="lbfgs", n_jobs=-1, max_iter=1000 + ), + "methods": ["predict", "predict_proba"], + "dataset": "classifier", }, { - 'model': RandomForestRegressor(random_state=0, oob_score=True, - max_samples=0.5, max_features='sqrt'), - 'methods': ['predict'], - 'dataset': 'regression', + "model": RandomForestRegressor( + random_state=0, oob_score=True, max_samples=0.5, max_features="sqrt" + ), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': LinearRegression(), - 'methods': ['predict'], - 'dataset': 'regression', + "model": LinearRegression(), + "methods": ["predict"], + "dataset": "regression", }, { - 'model': 
Ridge(random_state=0), - 'methods': ['predict'], - 'dataset': 'regression', + "model": Ridge(random_state=0), + "methods": ["predict"], + "dataset": "regression", }, ] TO_SKIP = [ - 'TSNE', # Absolute diff is 1e-10, potential problem in KNN, - # will be fixed for next release. (UPD. KNN is fixed but there is a problem - # with stability of stock sklearn. It is already stable in master, so, we - # need to wait for the next sklearn release) - 'LogisticRegression', # Absolute diff is 1e-8, will be fixed for next release - 'LogisticRegressionCV', # Absolute diff is 1e-10, will be fixed for next release - 'RandomForestRegressor', # Absolute diff is 1e-14 in OOB score, - # will be fixed for next release + "TSNE", # Absolute diff is 1e-10, potential problem in KNN, + # will be fixed for next release. (UPD. KNN is fixed but there is a problem + # with stability of stock sklearn. It is already stable in master, so, we + # need to wait for the next sklearn release) + "LogisticRegression", # Absolute diff is 1e-8, will be fixed for next release + "LogisticRegressionCV", # Absolute diff is 1e-10, will be fixed for next release + "RandomForestRegressor", # Absolute diff is 1e-14 in OOB score, + # will be fixed for next release ] -@pytest.mark.parametrize('model_head', MODELS_INFO) +@pytest.mark.parametrize("model_head", MODELS_INFO) def test_models(model_head): stable_algos = [] - if get_class_name(model_head['model']) in stable_algos \ - and daal_check_version((2021, 'P', 300)): + if get_class_name(model_head["model"]) in stable_algos and daal_check_version( + (2021, "P", 300) + ): try: - TO_SKIP.remove(get_class_name(model_head['model'])) + TO_SKIP.remove(get_class_name(model_head["model"])) except ValueError: pass - if get_class_name(model_head['model']) in TO_SKIP: + if get_class_name(model_head["model"]) in TO_SKIP: pytest.skip("Unstable", allow_module_level=False) - _run_test(model_head['model'], model_head['methods'], model_head['dataset']) + _run_test(model_head["model"], model_head["methods"], model_head["dataset"]) -@pytest.mark.parametrize('features', range(5, 10)) +@pytest.mark.parametrize("features", range(5, 10)) def test_train_test_split(features): - X, y = make_classification(n_samples=4000, n_features=features, - n_informative=features, n_redundant=0, - n_clusters_per_class=8, random_state=0) - baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test = \ - train_test_split(X, y, test_size=0.33, random_state=0) + X, y = make_classification( + n_samples=4000, + n_features=features, + n_informative=features, + n_redundant=0, + n_clusters_per_class=8, + random_state=0, + ) + ( + baseline_X_train, + baseline_X_test, + baseline_y_train, + baseline_y_test, + ) = train_test_split(X, y, test_size=0.33, random_state=0) baseline = [baseline_X_train, baseline_X_test, baseline_y_train, baseline_y_test] for _ in range(10): - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, - random_state=0) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.33, random_state=0 + ) res = [X_train, X_test, y_train, y_test] for a, b in zip(res, baseline): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str("train_test_split is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("train_test_split is incorrect") + ) -@pytest.mark.parametrize('metric', ['cosine', 'correlation']) +@pytest.mark.parametrize("metric", ["cosine", "correlation"]) def test_pairwise_distances(metric): X = np.random.rand(1000) X = 
np.array(X, dtype=np.float64) @@ -368,16 +411,18 @@ def test_pairwise_distances(metric): for _ in range(5): res = pairwise_distances(X.reshape(1, -1), metric=metric) for a, b in zip(res, baseline): - np.testing.assert_allclose(a, b, rtol=0.0, atol=0.0, - err_msg=str("pairwise_distances is incorrect")) + np.testing.assert_allclose( + a, b, rtol=0.0, atol=0.0, err_msg=str("pairwise_distances is incorrect") + ) -@pytest.mark.parametrize('array_size', [100, 1000, 10000]) +@pytest.mark.parametrize("array_size", [100, 1000, 10000]) def test_roc_auc(array_size): a = [random.randint(0, 1) for i in range(array_size)] b = [random.randint(0, 1) for i in range(array_size)] baseline = roc_auc_score(a, b) for _ in range(5): res = roc_auc_score(a, b) - np.testing.assert_allclose(baseline, res, rtol=0.0, atol=0.0, - err_msg=str("roc_auc is incorrect")) + np.testing.assert_allclose( + baseline, res, rtol=0.0, atol=0.0, err_msg=str("roc_auc is incorrect") + ) diff --git a/sklearnex/tests/utils/_launch_algorithms.py b/sklearnex/tests/utils/_launch_algorithms.py index cc6038eb9e..dfd2cb49fe 100755 --- a/sklearnex/tests/utils/_launch_algorithms.py +++ b/sklearnex/tests/utils/_launch_algorithms.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,25 +12,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -import numpy as np import logging import random +import numpy as np + from sklearnex import patch_sklearn + patch_sklearn() +import pathlib +import sys + +from sklearn.datasets import load_diabetes, load_iris, make_regression from sklearn.metrics import pairwise_distances, roc_auc_score -from sklearn.datasets import ( - make_regression, - load_iris, - load_diabetes) -import sys -import pathlib absolute_path = str(pathlib.Path(__file__).parent.absolute()) -sys.path.append(absolute_path + '/../') +sys.path.append(absolute_path + "/../") from _models_info import MODELS_INFO, TYPES @@ -39,80 +39,80 @@ def get_class_name(x): def generate_dataset(name, dtype, model_name): - if model_name == 'LinearRegression': + if model_name == "LinearRegression": X, y = make_regression(n_samples=1000, n_features=5) - elif name in ['blobs', 'classifier']: + elif name in ["blobs", "classifier"]: X, y = load_iris(return_X_y=True) - elif name == 'regression': + elif name == "regression": X, y = load_diabetes(return_X_y=True) else: - raise ValueError('Unknown dataset type') + raise ValueError("Unknown dataset type") X = np.array(X, dtype=dtype) y = np.array(y, dtype=dtype) return (X, y) def run_patch(model_info, dtype): - print(get_class_name(model_info['model']), dtype.__name__) - X, y = generate_dataset(model_info['dataset'], - dtype, - get_class_name(model_info['model'])) - model = model_info['model'] + print(get_class_name(model_info["model"]), dtype.__name__) + X, y = generate_dataset( + model_info["dataset"], dtype, get_class_name(model_info["model"]) + ) + model = model_info["model"] model.fit(X, y) - logging.info('fit') - for i in model_info['methods']: - if i == 'predict': + logging.info("fit") + 
for i in model_info["methods"]: + if i == "predict": model.predict(X) - elif i == 'predict_proba': + elif i == "predict_proba": model.predict_proba(X) - elif i == 'predict_log_proba': + elif i == "predict_log_proba": model.predict_log_proba(X) - elif i == 'decision_function': + elif i == "decision_function": model.decision_function(X) - elif i == 'fit_predict': + elif i == "fit_predict": model.fit_predict(X) - elif i == 'transform': + elif i == "transform": model.transform(X) - elif i == 'fit_transform': + elif i == "fit_transform": model.fit_transform(X) - elif i == 'kneighbors': + elif i == "kneighbors": model.kneighbors(X) - elif i == 'score': + elif i == "score": model.score(X, y) else: - raise ValueError(i + ' is wrong method') + raise ValueError(i + " is wrong method") logging.info(i) def run_algotithms(): for info in MODELS_INFO: for t in TYPES: - model_name = get_class_name(info['model']) - if model_name in ['Ridge', 'LinearRegression'] and t.__name__ == 'uint32': + model_name = get_class_name(info["model"]) + if model_name in ["Ridge", "LinearRegression"] and t.__name__ == "uint32": continue run_patch(info, t) def run_utils(): # pairwise_distances - for metric in ['cosine', 'correlation']: + for metric in ["cosine", "correlation"]: for t in TYPES: X = np.random.rand(1000) X = np.array(X, dtype=t) - print('pairwise_distances', t.__name__) + print("pairwise_distances", t.__name__) _ = pairwise_distances(X.reshape(1, -1), metric=metric) - logging.info('pairwise_distances') + logging.info("pairwise_distances") # roc_auc_score for t in [np.float32, np.float64]: a = [random.randint(0, 1) for i in range(1000)] b = [random.randint(0, 1) for i in range(1000)] a = np.array(a, dtype=t) b = np.array(b, dtype=t) - print('roc_auc_score', t.__name__) + print("roc_auc_score", t.__name__) _ = roc_auc_score(a, b) - logging.info('roc_auc_score') + logging.info("roc_auc_score") -if __name__ == '__main__': +if __name__ == "__main__": run_algotithms() run_utils() diff --git a/sklearnex/utils/__init__.py b/sklearnex/utils/__init__.py index eb5355bc4f..4c3fe21154 100755 --- a/sklearnex/utils/__init__.py +++ b/sklearnex/utils/__init__.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from .validation import _assert_all_finite -__all__ = ['_assert_all_finite'] +__all__ = ["_assert_all_finite"] diff --git a/sklearnex/utils/validation.py b/sklearnex/utils/validation.py index 3e75d0fac5..8457e46314 100755 --- a/sklearnex/utils/validation.py +++ b/sklearnex/utils/validation.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn.utils.validation import _assert_all_finite