From 65afadb131dac40cf3fd2c756f3db518ba4620a8 Mon Sep 17 00:00:00 2001
From: olegkkruglov <102592747+olegkkruglov@users.noreply.github.com>
Date: Tue, 22 Oct 2024 10:15:08 +0200
Subject: [PATCH] DOC: Add info about incremental algorithms and BS (#2103)
 (#2112)

* DOC: Add info about incremental algorithms and BS (#2103)

- Add 'Non-Scikit algorithms' part to the docs
- Add info about IncrementalPCA
- Add sphinx.napoleon extension to generate docs from docstrings.
- Update docstrings for non-scikit algorithms

* Remove info about 2025.1 changes from BS and IncBS docstrings

---------

Co-authored-by: Samir Nasibli
---
 doc/sources/algorithms.rst                   |  3 +
 doc/sources/conf.py                          |  1 +
 doc/sources/index.rst                        |  1 +
 doc/sources/non-scikit-algorithms.rst        | 44 ++++++++++++++
 .../basic_statistics/basic_statistics.py     | 35 +++++++++--
 .../incremental_basic_statistics.py          | 55 +++++++++++++----
 .../covariance/incremental_covariance.py     | 28 +++++++--
 sklearnex/linear_model/incremental_linear.py | 60 ++++++++++++-------
 8 files changed, 186 insertions(+), 41 deletions(-)
 create mode 100644 doc/sources/non-scikit-algorithms.rst

diff --git a/doc/sources/algorithms.rst b/doc/sources/algorithms.rst
index 6a73ee2b96..d7907e02f7 100755
--- a/doc/sources/algorithms.rst
+++ b/doc/sources/algorithms.rst
@@ -159,6 +159,9 @@ Dimensionality Reduction
         - ``svd_solver`` not in [`'full'`, `'covariance_eigh'`]
      - Sparse data is not supported
+   * - `IncrementalPCA`
+     - All parameters are supported
+     - Sparse data is not supported
    * - `TSNE`
      - All parameters are supported except:
diff --git a/doc/sources/conf.py b/doc/sources/conf.py
index b65842bfda..d40be16012 100755
--- a/doc/sources/conf.py
+++ b/doc/sources/conf.py
@@ -67,6 +67,7 @@
     "notfound.extension",
     "sphinx_design",
     "sphinx_copybutton",
+    "sphinx.ext.napoleon",
 ]

 # Add any paths that contain templates here, relative to this directory.
diff --git a/doc/sources/index.rst b/doc/sources/index.rst
index d6b52ae806..4489f995a3 100755
--- a/doc/sources/index.rst
+++ b/doc/sources/index.rst
@@ -105,6 +105,7 @@ Enable Intel(R) GPU optimizations
    algorithms.rst
    oneAPI and GPU support
    distributed-mode.rst
+   non-scikit-algorithms.rst
    array_api.rst
    verbose.rst
    deprecation.rst
diff --git a/doc/sources/non-scikit-algorithms.rst b/doc/sources/non-scikit-algorithms.rst
new file mode 100644
index 0000000000..620461843f
--- /dev/null
+++ b/doc/sources/non-scikit-algorithms.rst
@@ -0,0 +1,44 @@
+.. ******************************************************************************
+.. * Copyright 2024 Intel Corporation
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+Non-Scikit-Learn Algorithms
+===========================
+Algorithms that are not present in the original scikit-learn are described here. All algorithms are
+available for both CPU and GPU (including distributed mode).
+
+BasicStatistics
+---------------
+.. autoclass:: sklearnex.basic_statistics.BasicStatistics
+.. automethod:: sklearnex.basic_statistics.BasicStatistics.fit
+
+IncrementalBasicStatistics
+--------------------------
+.. autoclass:: sklearnex.basic_statistics.IncrementalBasicStatistics
+.. automethod:: sklearnex.basic_statistics.IncrementalBasicStatistics.fit
+.. automethod:: sklearnex.basic_statistics.IncrementalBasicStatistics.partial_fit
+
+IncrementalEmpiricalCovariance
+------------------------------
+.. autoclass:: sklearnex.covariance.IncrementalEmpiricalCovariance
+.. automethod:: sklearnex.covariance.IncrementalEmpiricalCovariance.fit
+.. automethod:: sklearnex.covariance.IncrementalEmpiricalCovariance.partial_fit
+
+IncrementalLinearRegression
+---------------------------
+.. autoclass:: sklearnex.linear_model.IncrementalLinearRegression
+.. automethod:: sklearnex.linear_model.IncrementalLinearRegression.fit
+.. automethod:: sklearnex.linear_model.IncrementalLinearRegression.partial_fit
+.. automethod:: sklearnex.linear_model.IncrementalLinearRegression.predict
\ No newline at end of file
diff --git a/sklearnex/basic_statistics/basic_statistics.py b/sklearnex/basic_statistics/basic_statistics.py
index 546d52b5b3..f1ab30207a 100644
--- a/sklearnex/basic_statistics/basic_statistics.py
+++ b/sklearnex/basic_statistics/basic_statistics.py
@@ -32,12 +32,16 @@ class BasicStatistics(BaseEstimator):
     """
     Estimator for basic statistics.
     Allows to compute basic statistics for provided data.
+
     Parameters
     ----------
     result_options: string or list, default='all'
-        List of statistics to compute
+        Used to set statistics to calculate. Possible values are ``'min'``, ``'max'``, ``'sum'``, ``'mean'``, ``'variance'``,
+        ``'variation'``, ``'sum_squares'``, ``'sum_squares_centered'``, ``'standard_deviation'``, ``'second_order_raw_moment'``,
+        or a list containing any of these values. If set to ``'all'``, then all possible statistics will be
+        calculated.

-    Attributes (are existing only if corresponding result option exists)
+    Attributes
     ----------
     min : ndarray of shape (n_features,)
         Minimum of each feature over all samples.
@@ -59,6 +63,27 @@ class BasicStatistics(BaseEstimator):
         Centered sum of squares for each feature over all samples.
     second_order_raw_moment : ndarray of shape (n_features,)
         Second order moment of each feature over all samples.
+
+    Note
+    ----
+    An attribute exists only if the corresponding result option has been provided.
+
+    Note
+    ----
+    Some results can exhibit small variations due to
+    floating point error accumulation and multithreading.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearnex.basic_statistics import BasicStatistics
+    >>> bs = BasicStatistics(result_options=['sum', 'min', 'max'])
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> bs.fit(X)
+    >>> bs.sum_
+    np.array([4., 6.])
+    >>> bs.min_
+    np.array([1., 2.])
     """

     def __init__(self, result_options="all"):
@@ -113,14 +138,14 @@ def fit(self, X, y=None, *, sample_weight=None):
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Data for compute, where `n_samples` is the number of samples and
-            `n_features` is the number of features.
+            Data to compute statistics on, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.

         y : Ignored
             Not used, present for API consistency by convention.

         sample_weight : array-like of shape (n_samples,), default=None
-            Weights for compute weighted statistics, where `n_samples` is the number of samples.
+            Weights used to compute weighted statistics, where ``n_samples`` is the number of samples.

         Returns
         -------
diff --git a/sklearnex/basic_statistics/incremental_basic_statistics.py b/sklearnex/basic_statistics/incremental_basic_statistics.py
index 2ffa143421..53318d181b 100644
--- a/sklearnex/basic_statistics/incremental_basic_statistics.py
+++ b/sklearnex/basic_statistics/incremental_basic_statistics.py
@@ -37,8 +37,10 @@
 @control_n_jobs(decorated_methods=["partial_fit", "_onedal_finalize_fit"])
 class IncrementalBasicStatistics(BaseEstimator):
     """
-    Incremental estimator for basic statistics.
-    Allows to compute basic statistics if data are splitted into batches.
+    Calculates basic statistics on the given data and allows for computation when the data are split into
+    batches. The user can use the ``partial_fit`` method to provide a single batch of data or the ``fit`` method to
+    provide the entire dataset.
+
     Parameters
     ----------
     result_options: string or list, default='all'
@@ -47,10 +49,9 @@ class IncrementalBasicStatistics(BaseEstimator):
     batch_size : int, default=None
         The number of samples to use for each batch. Only used when calling
         ``fit``. If ``batch_size`` is ``None``, then ``batch_size``
-        is inferred from the data and set to ``5 * n_features``, to provide a
-        balance between approximation accuracy and memory consumption.
+        is inferred from the data and set to ``5 * n_features``.

-    Attributes (are existing only if corresponding result option exists)
+    Attributes
     ----------
     min : ndarray of shape (n_features,)
         Minimum of each feature over all samples.
@@ -81,6 +82,38 @@ class IncrementalBasicStatistics(BaseEstimator):
     second_order_raw_moment : ndarray of shape (n_features,)
         Second order moment of each feature over all samples.
+
+    n_samples_seen_ : int
+        The number of samples processed by the estimator. Will be reset on
+        new calls to ``fit``, but increments across ``partial_fit`` calls.
+
+    batch_size_ : int
+        Inferred batch size from ``batch_size``.
+
+    n_features_in_ : int
+        Number of features seen during ``fit`` or ``partial_fit``.
+
+    Note
+    ----
+    An attribute exists only if the corresponding result option has been provided.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearnex.basic_statistics import IncrementalBasicStatistics
+    >>> incbs = IncrementalBasicStatistics(batch_size=1)
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> incbs.partial_fit(X[:1])
+    >>> incbs.partial_fit(X[1:])
+    >>> incbs.sum_
+    np.array([4., 6.])
+    >>> incbs.min_
+    np.array([1., 2.])
+    >>> incbs.fit(X)
+    >>> incbs.sum_
+    np.array([4., 6.])
+    >>> incbs.max_
+    np.array([3., 4.])
     """

     _onedal_incremental_basic_statistics = staticmethod(onedal_IncrementalBasicStatistics)
@@ -229,14 +262,14 @@ def partial_fit(self, X, sample_weight=None):
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Data for compute, where `n_samples` is the number of samples and
-            `n_features` is the number of features.
+            Data to compute statistics on, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.

         y : Ignored
             Not used, present for API consistency by convention.

         sample_weight : array-like of shape (n_samples,), default=None
-            Weights for compute weighted statistics, where `n_samples` is the number of samples.
+            Weights used to compute weighted statistics, where ``n_samples`` is the number of samples.

         Returns
         -------
@@ -261,14 +294,14 @@ def fit(self, X, y=None, sample_weight=None):
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Data for compute, where `n_samples` is the number of samples and
-            `n_features` is the number of features.
+            Data to compute statistics on, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features.

         y : Ignored
             Not used, present for API consistency by convention.

         sample_weight : array-like of shape (n_samples,), default=None
-            Weights for compute weighted statistics, where `n_samples` is the number of samples.
+            Weights used to compute weighted statistics, where ``n_samples`` is the number of samples.

         Returns
         -------
diff --git a/sklearnex/covariance/incremental_covariance.py b/sklearnex/covariance/incremental_covariance.py
index 75c5a01986..7f6c402edd 100644
--- a/sklearnex/covariance/incremental_covariance.py
+++ b/sklearnex/covariance/incremental_covariance.py
@@ -44,9 +44,9 @@
 @control_n_jobs(decorated_methods=["partial_fit", "fit", "_onedal_finalize_fit"])
 class IncrementalEmpiricalCovariance(BaseEstimator):
     """
-    Incremental estimator for covariance.
-    Allows to compute empirical covariance estimated by maximum
-    likelihood method if data are splitted into batches.
+    Maximum likelihood covariance estimator that allows for estimation when the data are split into
+    batches. The user can use the ``partial_fit`` method to provide a single batch of data or the ``fit`` method to
+    provide the entire dataset.

     Parameters
     ----------
@@ -79,13 +79,31 @@ class IncrementalEmpiricalCovariance(BaseEstimator):
     n_samples_seen_ : int
         The number of samples processed by the estimator. Will be reset on
-        new calls to fit, but increments across ``partial_fit`` calls.
+        new calls to ``fit``, but increments across ``partial_fit`` calls.

     batch_size_ : int
         Inferred batch size from ``batch_size``.

     n_features_in_ : int
-        Number of features seen during :term:`fit` `partial_fit`.
+        Number of features seen during ``fit`` or ``partial_fit``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearnex.covariance import IncrementalEmpiricalCovariance
+    >>> inccov = IncrementalEmpiricalCovariance(batch_size=1)
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> inccov.partial_fit(X[:1])
+    >>> inccov.partial_fit(X[1:])
+    >>> inccov.covariance_
+    np.array([[1., 1.],[1., 1.]])
+    >>> inccov.location_
+    np.array([2., 3.])
+    >>> inccov.fit(X)
+    >>> inccov.covariance_
+    np.array([[1., 1.],[1., 1.]])
+    >>> inccov.location_
+    np.array([2., 3.])
     """

     _onedal_incremental_covariance = staticmethod(onedal_IncrementalEmpiricalCovariance)
diff --git a/sklearnex/linear_model/incremental_linear.py b/sklearnex/linear_model/incremental_linear.py
index bce70f13da..6b0cca1cd4 100644
--- a/sklearnex/linear_model/incremental_linear.py
+++ b/sklearnex/linear_model/incremental_linear.py
@@ -49,8 +49,9 @@
 )
 class IncrementalLinearRegression(MultiOutputMixin, RegressorMixin, BaseEstimator):
     """
-    Incremental estimator for linear regression.
-    Allows to train linear regression if data are splitted into batches.
+    Trains a linear regression model and allows for computation when the data are split into
+    batches. The user can use the ``partial_fit`` method to provide a single batch of data or the ``fit`` method to
+    provide the entire dataset.

     Parameters
     ----------
@@ -68,8 +69,7 @@ class IncrementalLinearRegression(MultiOutputMixin, RegressorMixin, BaseEstimato
     batch_size : int, default=None
         The number of samples to use for each batch. Only used when calling
         ``fit``. If ``batch_size`` is ``None``, then ``batch_size``
-        is inferred from the data and set to ``5 * n_features``, to provide a
-        balance between approximation accuracy and memory consumption.
+        is inferred from the data and set to ``5 * n_features``.

     Attributes
     ----------
@@ -83,12 +83,9 @@ class IncrementalLinearRegression(MultiOutputMixin, RegressorMixin, BaseEstimato
         Independent term in the linear model. Set to 0.0 if
         `fit_intercept = False`.

-    n_features_in_ : int
-        Number of features seen during :term:`fit`.
-
     n_samples_seen_ : int
         The number of samples processed by the estimator. Will be reset on
-        new calls to fit, but increments across ``partial_fit`` calls.
+        new calls to ``fit``, but increments across ``partial_fit`` calls.
         It should be not less than `n_features_in_` if `fit_intercept`
         is False and not less than `n_features_in_` + 1 if `fit_intercept`
         is True to obtain regression coefficients.
@@ -97,8 +94,26 @@ class IncrementalLinearRegression(MultiOutputMixin, RegressorMixin, BaseEstimato
         Inferred batch size from ``batch_size``.

     n_features_in_ : int
-        Number of features seen during :term:`fit` `partial_fit`.
-
+        Number of features seen during ``fit`` or ``partial_fit``.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearnex.linear_model import IncrementalLinearRegression
+    >>> inclr = IncrementalLinearRegression(batch_size=2)
+    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 10]])
+    >>> y = np.array([1.5, 3.5, 5.5, 8.5])
+    >>> inclr.partial_fit(X[:2], y[:2])
+    >>> inclr.partial_fit(X[2:], y[2:])
+    >>> inclr.coef_
+    np.array([0.5, 0.5])
+    >>> inclr.intercept_
+    np.array(0.)
+    >>> inclr.fit(X, y)
+    >>> inclr.coef_
+    np.array([0.5, 0.5])
+    >>> inclr.intercept_
+    np.array(0.)
     """

     _onedal_incremental_linear = staticmethod(onedal_IncrementalLinearRegression)
@@ -311,12 +326,12 @@ def partial_fit(self, X, y, check_input=True):
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples and
+            Training data, where ``n_samples`` is the number of samples and
             `n_features` is the number of features.

         y : array-like of shape (n_samples,) or (n_samples, n_targets)
-            Target values, where `n_samples` is the number of samples and
-            `n_targets` is the number of targets.
+            Target values, where ``n_samples`` is the number of samples and
+            ``n_targets`` is the number of targets.

         Returns
         -------
@@ -339,20 +354,20 @@ def fit(self, X, y):
         """
-        Fit the model with X and y, using minibatches of size batch_size.
+        Fit the model with X and y, using minibatches of size ``batch_size``.

         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
-            Training data, where `n_samples` is the number of samples and
-            `n_features` is the number of features. It is necessary for
-            `n_samples` to be not less than `n_features` if `fit_intercept`
-            is False and not less than `n_features` + 1 if `fit_intercept`
+            Training data, where ``n_samples`` is the number of samples and
+            ``n_features`` is the number of features. It is necessary for
+            ``n_samples`` to be not less than ``n_features`` if ``fit_intercept``
+            is False and not less than ``n_features + 1`` if ``fit_intercept``
             is True

         y : array-like of shape (n_samples,) or (n_samples, n_targets)
-            Target values, where `n_samples` is the number of samples and
-            `n_targets` is the number of targets.
+            Target values, where ``n_samples`` is the number of samples and
+            ``n_targets`` is the number of targets.

         Returns
         -------
@@ -376,10 +391,15 @@ def fit(self, X, y):
     def predict(self, X, y=None):
         """
         Predict using the linear model.
+
         Parameters
         ----------
         X : array-like or sparse matrix, shape (n_samples, n_features)
             Samples.
+
+        y : Ignored
+            Not used, present for API consistency by convention.
+
         Returns
         -------
         C : array, shape (n_samples, n_targets)
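
The snippet below is a usage sketch for the incremental estimators documented by this patch: data is fed to ``partial_fit`` batch by batch and the results are then read back from the fitted attributes. The toy data and the use of ``numpy.array_split`` for batching are assumptions made for illustration; only the estimator classes, their ``partial_fit``/``predict`` methods, and the ``sum_``/``max_``/``coef_``/``intercept_`` attributes come from the patch itself.

import numpy as np

from sklearnex.basic_statistics import IncrementalBasicStatistics
from sklearnex.linear_model import IncrementalLinearRegression

# Toy data chosen for illustration only.
X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 10.0]])
y = np.array([1.5, 3.5, 5.5, 8.5])

# Accumulate statistics over two batches; the requested result options
# become available as attributes (sum_, max_) once data has been seen.
incbs = IncrementalBasicStatistics(result_options=["sum", "max"])
for X_batch in np.array_split(X, 2):
    incbs.partial_fit(X_batch)
print(incbs.sum_, incbs.max_)

# Update the linear model batch by batch, then predict on the full data.
inclr = IncrementalLinearRegression()
for X_batch, y_batch in zip(np.array_split(X, 2), np.array_split(y, 2)):
    inclr.partial_fit(X_batch, y_batch)
print(inclr.coef_, inclr.intercept_)
print(inclr.predict(X))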