diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml index fb03f51f53..f62c827c9e 100644 --- a/.ci/pipeline/build-and-test-lnx.yml +++ b/.ci/pipeline/build-and-test-lnx.yml @@ -45,7 +45,7 @@ steps: . /usr/share/miniconda/etc/profile.d/conda.sh conda activate CB bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) - pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt + pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) if [ $(echo $(PYTHON_VERSION) | grep '3.8\|3.9\|3.10') ]; then conda install -q -y -c intel dpnp; fi pip list diff --git a/.ci/pipeline/build-and-test-mac.yml b/.ci/pipeline/build-and-test-mac.yml index c9f6d05345..0df12bc5d0 100644 --- a/.ci/pipeline/build-and-test-mac.yml +++ b/.ci/pipeline/build-and-test-mac.yml @@ -40,7 +40,7 @@ steps: - script: | source activate CB bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) - pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt + pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) pip list displayName: 'Install testing requirements' diff --git a/.ci/pipeline/build-and-test-win.yml b/.ci/pipeline/build-and-test-win.yml index 1bf9d2e365..2875513cb3 100644 --- a/.ci/pipeline/build-and-test-win.yml +++ b/.ci/pipeline/build-and-test-win.yml @@ -43,7 +43,7 @@ steps: set PATH=C:\msys64\usr\bin;%PATH% call activate CB bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) - pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt + pip install --upgrade -r requirements-test.txt cd .. for /f "delims=" %%c in ('python s\.ci\scripts\get_compatible_scipy_version.py') do set SCIPY_VERSION=%%c pip install %SCIPY_VERSION% diff --git a/.ci/pipeline/nightly.yml b/.ci/pipeline/nightly.yml index d6ea8393e4..7c3e707cfe 100644 --- a/.ci/pipeline/nightly.yml +++ b/.ci/pipeline/nightly.yml @@ -64,7 +64,7 @@ jobs: conda activate CB pip install -r dependencies-dev pip install -r requirements-doc.txt - pip install -r requirements-test.txt -r requirements-test-optional.txt + pip install -r requirements-test.txt pip install jupyter matplotlib requests displayName: 'Install requirements' - script: | diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ceb64309c4..53dc619061 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -13,17 +13,16 @@ requirements-doc.txt @maria-Petrova @napetrov @aepanchi @Alexsandruss onedal/ @Alexsandruss @samir-nasibli @KulikovNikita sklearnex/ @Alexsandruss @samir-nasibli @KulikovNikita -# Examples +# Examples examples/ @maria-Petrova @Alexsandruss @samir-nasibli @napetrov # Dependencies setup.py @napetrov @Alexsandruss @samir-nasibli requirements* @napetrov @Alexsandruss @samir-nasibli @homksei @ahuber21 @ethanglaser -conda-recipe/ @napetrov @Alexsandruss +conda-recipe/ @napetrov @Alexsandruss # Model builders *model_builders* @razdoburdin @ahuber21 @avolkov-intel -requirements-test-optional.txt @razdoburdin @ahuber21 @avolkov-intel # Forests *ensemble* @ahuber21 @icfaust diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index ce7f82b2e3..bad2eddf19 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -200,7 +200,9 @@ def _predict_classification(self, X, fptype, resultsToEvaluate): else: return predict_result.probabilities - def _predict_regression(self, X, fptype): + def _predict_regression( + self, X, fptype, pred_contribs=False, 
pred_interactions=False + ): if X.shape[1] != self.n_features_in_: raise ValueError("Shape of input is different from what was seen in `fit`") @@ -212,22 +214,64 @@ def _predict_regression(self, X, fptype): ).format(type(self).__name__) ) - # Prediction + try: + return self._predict_regression_with_results_to_compute( + X, fptype, pred_contribs, pred_interactions + ) + except TypeError as e: + if "unexpected keyword argument 'resultsToCompute'" in str(e): + if pred_contribs or pred_interactions: + # SHAP values requested, but not supported by this version + raise TypeError( + f"{'pred_contribs' if pred_contribs else 'pred_interactions'} not supported by this version of daal4py" + ) from e + else: + # unknown type error + raise + + # fallback to calculation without `resultsToCompute` predict_algo = d4p.gbt_regression_prediction(fptype=fptype) predict_result = predict_algo.compute(X, self.daal_model_) return predict_result.prediction.ravel() + def _predict_regression_with_results_to_compute( + self, X, fptype, pred_contribs=False, pred_interactions=False + ): + """Assume daal4py supports the resultsToCompute kwarg""" + resultsToCompute = "" + if pred_contribs: + resultsToCompute = "shapContributions" + elif pred_interactions: + resultsToCompute = "shapInteractions" + + predict_algo = d4p.gbt_regression_prediction( + fptype=fptype, resultsToCompute=resultsToCompute + ) + predict_result = predict_algo.compute(X, self.daal_model_) + + if pred_contribs: + return predict_result.prediction.ravel().reshape((-1, X.shape[1] + 1)) + elif pred_interactions: + return predict_result.prediction.ravel().reshape( + (-1, X.shape[1] + 1, X.shape[1] + 1) + ) + else: + return predict_result.prediction.ravel() + class GBTDAALModel(GBTDAALBaseModel): def __init__(self): pass - def predict(self, X): + def predict(self, X, pred_contribs=False, pred_interactions=False): fptype = getFPType(X) if self._is_regression: - return self._predict_regression(X, fptype) + return self._predict_regression(X, fptype, pred_contribs, pred_interactions) else: + if pred_contribs or pred_interactions: + raise NotImplementedError( + f"{'pred_contribs' if pred_contribs else 'pred_interactions'} is not implemented for classification models" + ) return self._predict_classification(X, fptype, "computeClassLabels") def predict_proba(self, X): diff --git a/doc/daal4py/model-builders.rst b/doc/daal4py/model-builders.rst index 3d1f9e7b26..4e89b2a849 100644 --- a/doc/daal4py/model-builders.rst +++ b/doc/daal4py/model-builders.rst @@ -24,17 +24,17 @@ Model Builders for the Gradient Boosting Frameworks Introduction ------------------ -Gradient boosting on decision trees is one of the most accurate and efficient -machine learning algorithms for classification and regression. -The most popular implementations of it are: +Gradient boosting on decision trees is one of the most accurate and efficient +machine learning algorithms for classification and regression. +The most popular implementations of it are: * XGBoost* * LightGBM* * CatBoost* daal4py Model Builders deliver the accelerated -models inference of those frameworks. The inference is performed by the oneDAL GBT implementation tuned -for the best performance on the Intel(R) Architecture. +model inference for these frameworks. The inference is performed by the oneDAL GBT implementation, tuned +for the best performance on Intel(R) Architecture. 
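For orientation, this is the complete workflow in miniature (a sketch only: ``xgb_model`` stands in for an already-trained XGBoost estimator and ``test_data`` for a NumPy array of observations; conversion and inference are described in the sections below):

 ::

    import daal4py as d4p

    # convert the trained booster once, then reuse the daal4py model for inference
    d4p_model = d4p.mb.convert_model(xgb_model.get_booster())
    d4p_prediction = d4p_model.predict(test_data)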
Conversion --------- @@ -61,22 +61,49 @@ CatBoost:: Classification and Regression Inference ---------------------------------------- -The API is the same for classification and regression inference. -Based on the original model passed to the ``convert_model``, ``d4p_prediction`` is either the classification or regression output. - +The API is the same for classification and regression inference. +Based on the original model passed to ``convert_model()``, ``d4p_prediction`` is either the classification or regression output. + :: - + d4p_prediction = d4p_model.predict(test_data) Here, the ``predict()`` method of ``d4p_model`` is being used to make predictions on the ``test_data`` dataset. -The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method. +The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method. + +SHAP Value Calculation for Regression Models +------------------------------------------------------------ + +SHAP contribution and interaction value calculations are natively supported by models created with daal4py Model Builders. +For these models, the ``predict()`` method takes additional keyword arguments: + + :: + + d4p_model.predict(test_data, pred_contribs=True) # for SHAP contributions + d4p_model.predict(test_data, pred_interactions=True) # for SHAP interactions + +The returned prediction has the shape: + + * ``(n_rows, n_features + 1)`` for SHAP contributions + * ``(n_rows, n_features + 1, n_features + 1)`` for SHAP interactions +Here, ``n_rows`` is the number of rows (i.e., observations) in +``test_data``, and ``n_features`` is the number of features in the dataset. + +The prediction result for SHAP contributions includes a feature attribution value for each feature and a bias term for each observation. + +The prediction result for SHAP interactions comprises ``(n_features + 1) x (n_features + 1)`` values for all possible +feature combinations, along with their corresponding bias terms. +A short verification sketch is shown below, after the Limitations section. + +.. note:: The shapes of SHAP contributions and interactions are consistent with the XGBoost results. + In contrast, the `SHAP Python package `_ drops bias terms, resulting + in SHAP contributions (SHAP interactions) with one fewer column (one fewer column and row) per observation. Scikit-learn-style Estimators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can also use the scikit-learn-style classes ``GBTDAALClassifier`` and ``GBTDAALRegressor`` to convert and infer your models. For example: -:: +:: from daal4py.sklearn.ensemble import GBTDAALRegressor reg = xgb.XGBRegressor() reg.fit(X, y) d4p_predt = GBTDAALRegressor.convert_model(reg).predict(X) @@ -88,9 +115,9 @@ Limitations ------------------ Model Builders support only base inference with prediction and probabilities prediction. The functionality is to be extended. Therefore, there are the following limitations: -- The categorical features are not supported for conversion and prediction. +- The categorical features are not supported for conversion and prediction. - The multioutput models are not supported for conversion and prediction. -- The tree SHAP calculations are not supported. +- SHAP values can be calculated for regression models only. 
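To illustrate the SHAP output layout described above, a short verification sketch (assuming ``d4p_model`` is a converted regression model and ``test_data`` a 2D NumPy array; the additivity check relies on the XGBoost convention that, for regression objectives, the per-feature contributions plus the trailing bias column sum to the plain prediction):

 ::

    import numpy as np

    contribs = d4p_model.predict(test_data, pred_contribs=True)
    # one column per feature plus a trailing bias column
    assert contribs.shape == (test_data.shape[0], test_data.shape[1] + 1)

    # per-row contributions (including the bias column) add up to the prediction
    np.testing.assert_allclose(
        contribs.sum(axis=1), d4p_model.predict(test_data), rtol=1e-5
    )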
Examples @@ -98,6 +125,7 @@ Examples Model Builders models conversion - `XGBoost model conversion `_ +- `SHAP value prediction from an XGBoost model `_ - `LightGBM model conversion `_ - `CatBoost model conversion `_ diff --git a/examples/daal4py/log_reg_model_builder.py b/examples/mb/log_reg_model_builder.py similarity index 100% rename from examples/daal4py/log_reg_model_builder.py rename to examples/mb/log_reg_model_builder.py diff --git a/examples/daal4py/model_builders_catboost.py b/examples/mb/model_builders_catboost.py similarity index 100% rename from examples/daal4py/model_builders_catboost.py rename to examples/mb/model_builders_catboost.py diff --git a/examples/daal4py/model_builders_lightgbm.py b/examples/mb/model_builders_lightgbm.py similarity index 100% rename from examples/daal4py/model_builders_lightgbm.py rename to examples/mb/model_builders_lightgbm.py diff --git a/examples/daal4py/model_builders_xgboost.py b/examples/mb/model_builders_xgboost.py similarity index 100% rename from examples/daal4py/model_builders_xgboost.py rename to examples/mb/model_builders_xgboost.py diff --git a/examples/mb/model_builders_xgboost_shap.py b/examples/mb/model_builders_xgboost_shap.py new file mode 100644 index 0000000000..7780714fd5 --- /dev/null +++ b/examples/mb/model_builders_xgboost_shap.py @@ -0,0 +1,80 @@ +# ============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +# daal4py Gradient Boosting Regression model creation and SHAP value +# prediction example + +import numpy as np +import xgboost as xgb +from sklearn.datasets import make_regression +from sklearn.model_selection import train_test_split + +import daal4py as d4p + + +def main(*args, **kwargs): + # create data + X, y = make_regression(n_samples=10000, n_features=10, random_state=42) + X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42) + + # train the model + xgb_model = xgb.XGBRegressor( + max_depth=6, n_estimators=100, random_state=42, base_score=0.5 + ) + xgb_model.fit(X_train, y_train) + + # Conversion to daal4py + daal_model = d4p.mb.convert_model(xgb_model.get_booster()) + + # SHAP contributions + daal_contribs = daal_model.predict(X_test, pred_contribs=True) + + # SHAP interactions + daal_interactions = daal_model.predict(X_test, pred_interactions=True) + + # XGBoost reference values + xgb_contribs = xgb_model.get_booster().predict( + xgb.DMatrix(X_test), pred_contribs=True, validate_features=False + ) + xgb_interactions = xgb_model.get_booster().predict( + xgb.DMatrix(X_test), pred_interactions=True, validate_features=False + ) + + return ( + daal_contribs, + daal_interactions, + xgb_contribs, + xgb_interactions, + ) + + +if __name__ == "__main__": + daal_contribs, daal_interactions, xgb_contribs, xgb_interactions = main() + print(f"XGBoost SHAP contributions shape: {xgb_contribs.shape}") + print(f"daal4py SHAP contributions shape: {daal_contribs.shape}") + + print(f"XGBoost SHAP interactions shape: {xgb_interactions.shape}") + print(f"daal4py SHAP interactions shape: {daal_interactions.shape}") + + contribution_rmse = np.sqrt( + np.mean((daal_contribs.reshape(-1, 1) - xgb_contribs.reshape(-1, 1)) ** 2) + ) + print(f"SHAP contributions RMSE: {contribution_rmse:.2e}") + + interaction_rmse = np.sqrt( + np.mean((daal_interactions.reshape(-1, 1) - xgb_interactions.reshape(-1, 1)) ** 2) + ) + print(f"SHAP interactions RMSE: {interaction_rmse:.2e}") diff --git a/generator/parse.py b/generator/parse.py index a0fda84af3..6611c59c94 100644 --- a/generator/parse.py +++ b/generator/parse.py @@ -283,8 +283,14 @@ def parse(self, elem, ctxt): ctxt.enum = False return True regex = ( - r"^\s*(\w+)(?:\s*=\s*((\(int\))?\w(\w|:|\s|\+)*))?" - + r"(\s*,)?\s*((/\*|//).*)?$" + # capture group for value name + r"^\s*(\w+)" + # capture group for value (different possible formats, 123, 0x1, (1 << 5), etc.) + + r"(?:\s*=\s*((\(int\))?(\w|:|\s|\+|\(?\d+\s*<<\s*\d+\)?)*))?" + # comma after the value, plus possible comments + + r"(\s*,)?\s*((/\*|//).*)?" 
+ # EOL + + r"$" ) me = re.match(regex, elem) if me and not me.group(1).startswith("last"): diff --git a/requirements-test-optional.txt b/requirements-test-optional.txt deleted file mode 100644 index 45e2575ef4..0000000000 --- a/requirements-test-optional.txt +++ /dev/null @@ -1,4 +0,0 @@ -xgboost==1.7.6; python_version <= '3.9' -xgboost==2.0.0; python_version >= '3.10' -lightgbm==4.1.0 -catboost==1.2.2; python_version <= '3.11' diff --git a/requirements-test.txt b/requirements-test.txt index 10d61ade83..fc9c0ad4eb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,3 +6,8 @@ scikit-learn==1.2.2 ; python_version == '3.8' scikit-learn==1.3.1 ; python_version >= '3.9' pandas==2.0.1 ; python_version == '3.8' pandas==2.1.1 ; python_version >= '3.9' +xgboost==1.7.6; python_version <= '3.9' +xgboost==2.0.0; python_version >= '3.10' +lightgbm==4.1.0 +catboost==1.2.2; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported +shap==0.42.1; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index b6ed202037..c031e983ee 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -1,507 +1,720 @@ -#=============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#=============================================================================== - -import json -import re -from collections import deque -from os import getpid, remove -from time import time -from typing import Any, Deque, Dict, List - - -def get_lightgbm_params(booster): - return booster.dump_model() - -def get_xgboost_params(booster): - return json.loads(booster.save_config()) - -def get_catboost_params(booster): - dump_filename = f"catboost_model_{getpid()}_{time()}" - - # Dump model in file - booster.save_model(dump_filename, 'json') - - # Read json with model - with open(dump_filename) as file: - model_data = json.load(file) - - # Delete dump file - remove(dump_filename) - return model_data - -def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any: - class Node: - def __init__(self, tree: Dict[str, Any], parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position - - if lgb_model is None: - lgb_model = get_lightgbm_params(model) - - n_features = lgb_model["max_feature_idx"] + 1 - n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] - n_classes = lgb_model["num_tree_per_iteration"] - - is_regression = False - objective_fun = lgb_model["objective"] - if n_classes > 2: - if "multiclass" not in objective_fun: - raise TypeError( - "multiclass (softmax) objective is only supported for multiclass classification") - elif "binary" in objective_fun: # nClasses == 1 - n_classes = 2 - else: - is_regression = True - - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - - class_label = 0 - iterations_counter = 0 - for tree in lgb_model["tree_info"]: - if is_regression: - tree_id = mb.create_tree(tree["num_leaves"]*2-1) - else: - tree_id = mb.create_tree(n_nodes=tree["num_leaves"]*2-1, class_label=class_label) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = tree["tree_structure"] - - # root is leaf - if "leaf_value" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) - continue - - # add root - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], - feature_value=feat_val, default_left=default_left) - - # create stack - node_stack: List[Node] = [Node(sub_tree["left_child"], parent_id, 0), - Node(sub_tree["right_child"], parent_id, 1)] - - # dfs through it - while node_stack: - sub_tree = node_stack[-1].tree - parent_id = node_stack[-1].parent_id - position = node_stack[-1].position - node_stack.pop() - - # current node is leaf - if "leaf_index" in sub_tree: - mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf_value"], - parent_id=parent_id, position=position) - continue - - # current node is split - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - parent_id=parent_id, position=position) - - 
# append children - node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) - node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) - - return mb.model() - - -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - class Node: - def __init__(self, tree: Dict, parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position - - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. - lst = [*range(booster.num_features())] - booster.feature_names = [str(i) for i in lst] - - trees_arr = booster.get_dump(dump_format="json") - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification") - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification") - else: - is_regression = True - - if hasattr(booster, "best_iteration"): - n_iterations = booster.best_iteration + 1 - trees_arr = trees_arr[: n_iterations * (n_classes if n_classes > 2 else 1)] - else: - n_iterations = int(len(trees_arr) / (n_classes if n_classes > 2 else 1)) - - # Create + base iteration - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - - class_label = 0 - iterations_counter = 0 - mis_eq_yes = None - for tree in trees_arr: - n_nodes = 1 - # find out the number of nodes in the tree - for node in tree.split("nodeid")[1:]: - node_id = int(node[3:node.find(",")]) - if node_id + 1 > n_nodes: - n_nodes = node_id + 1 - if is_regression: - tree_id = mb.create_tree(n_nodes) - else: - tree_id = mb.create_tree(n_nodes=n_nodes, class_label=class_label) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = json.loads(tree) - - # root is leaf - if "leaf" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf"]) - continue - - # add root - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - parent_id = mb.add_split(tree_id=tree_id, feature_index=feature_index, - feature_value=feature_value, default_left=default_left) - - # create queue - node_queue: Deque[Node] = deque() - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - # bfs through it - while node_queue: - sub_tree = node_queue[0].tree - parent_id = 
node_queue[0].parent_id - position = node_queue[0].position - node_queue.popleft() - - # current node is leaf - if "leaf" in sub_tree: - mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf"], - parent_id=parent_id, position=position) - continue - - # current node is split - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - - parent_id = mb.add_split( - tree_id=tree_id, feature_index=feature_index, feature_value=feature_value, - default_left=default_left, parent_id=parent_id, position=position) - - # append to queue - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - return mb.model() - -def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: - if not model.is_fitted(): - raise RuntimeError( - "Model should be fitted before exporting to daal4py.") - - if model_data is None: - model_data = get_catboost_params(model) - - if 'categorical_features' in model_data['features_info']: - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - - n_features = len(model_data['features_info']['float_features']) - - is_symmetric_tree = model_data['model_info']['params']['tree_learner_options']['grow_policy'] == 'SymmetricTree' - - if is_symmetric_tree: - n_iterations = len(model_data['oblivious_trees']) - else: - n_iterations = len(model_data['trees']) - - n_classes = 0 - - if 'class_params' in model_data['model_info']: - is_classification = True - n_classes = len(model_data['model_info'] - ['class_params']['class_to_label']) - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - else: - is_classification = False - mb = gbt_reg_model_builder(n_features, n_iterations) - - splits = [] - - # Create splits array (all splits are placed sequentially) - for feature in model_data['features_info']['float_features']: - if feature['borders']: - for feature_border in feature['borders']: - splits.append( - {'feature_index': feature['feature_index'], 'value': feature_border}) - - if not is_classification: - bias = model_data['scale_and_bias'][1][0] / n_iterations - scale = model_data['scale_and_bias'][0] - else: - bias = 0 - scale = 1 - - trees_explicit = [] - tree_symmetric = [] - - if model_data['model_info']['params']['data_processing_options']['float_features_binarization']['nan_mode'] == 'Min': - default_left = 1 - else: - default_left = 0 - - for tree_num in range(n_iterations): - if is_symmetric_tree: - - if model_data['oblivious_trees'][tree_num]['splits'] is not None: - # Tree has more than 1 node - cur_tree_depth = len( - model_data['oblivious_trees'][tree_num]['splits']) - else: - cur_tree_depth = 0 - - tree_symmetric.append( - (model_data['oblivious_trees'][tree_num], cur_tree_depth)) - else: - class Node: - def __init__(self, parent=None, split=None, value=None) -> None: - self.right = None - self.left = None - self.split = split - self.value = value - - n_nodes = 1 - # Check if node is a leaf (in case of stump) - if 'split' in model_data['trees'][tree_num]: - # Get number of trees and splits info via BFS - # Create queue - nodes_queue = [] - root_node = Node( - split=splits[model_data['trees'][tree_num]['split']['split_index']]) - nodes_queue.append((model_data['trees'][tree_num], 
root_node)) - while nodes_queue: - cur_node_data, cur_node = nodes_queue.pop(0) - if 'value' in cur_node_data: - if isinstance(cur_node_data['value'], list): - cur_node.value = [ - value for value in cur_node_data['value']] - else: - cur_node.value = [ - cur_node_data['value'] * scale + bias] - else: - cur_node.split = splits[cur_node_data['split'] - ['split_index']] - left_node = Node() - right_node = Node() - cur_node.left = left_node - cur_node.right = right_node - nodes_queue.append((cur_node_data['left'], left_node)) - nodes_queue.append( - (cur_node_data['right'], right_node)) - n_nodes += 2 - else: - root_node = Node() - if is_classification and n_classes > 2: - root_node.value = [ - value * scale for value in model_data['trees'][tree_num]['value']] - else: - root_node.value = [model_data['trees'][tree_num]['value'] * scale + bias] - trees_explicit.append((root_node, n_nodes)) - - tree_id = [] - class_label = 0 - count = 0 - - # Only 1 tree for each iteration in case of regression or binary classification - if not is_classification or n_classes == 2: - n_tree_each_iter = 1 - else: - n_tree_each_iter = n_classes - - # Create id for trees (for the right order in modelbuilder) - for i in range(n_iterations): - for c in range(n_tree_each_iter): - if is_symmetric_tree: - n_nodes = 2**(tree_symmetric[i][1] + 1) - 1 - else: - n_nodes = trees_explicit[i][1] - - if is_classification and n_classes > 2: - tree_id.append(mb.create_tree(n_nodes, class_label)) - count += 1 - if count == n_iterations: - class_label += 1 - count = 0 - - elif is_classification: - tree_id.append(mb.create_tree(n_nodes, 0)) - else: - tree_id.append(mb.create_tree(n_nodes)) - - - if is_symmetric_tree: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - cur_tree_info = tree_symmetric[i][0] - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info['leaf_values'] - cur_tree_depth = tree_symmetric[i][1] - - if cur_tree_depth == 0: - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) - else: - # One split used for the whole level - cur_level_split = splits[cur_tree_info['splits'] - [cur_tree_depth - 1]['split_index']] - root_id = mb.add_split( - tree_id=cur_tree_id, feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - prev_level_nodes = [root_id] - - # Iterate over levels, splits in json are reversed (root split is the last) - for cur_level in range(cur_tree_depth - 2, -1, -1): - cur_level_nodes = [] - for cur_parent in prev_level_nodes: - cur_level_split = splits[cur_tree_info['splits'] - [cur_level]['split_index']] - cur_left_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=0, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - cur_right_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=1, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - cur_level_nodes.append(cur_left_node) - cur_level_nodes.append(cur_right_node) - prev_level_nodes = cur_level_nodes - - # Different storing format for leaves - if not is_classification or n_classes == 2: - for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) - mb.add_leaf(tree_id=cur_tree_id, 
response=cur_tree_leaf_val[2 * last_level_node_num + 1] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) - else: - for last_level_node_num in range(len(prev_level_nodes)): - left_index = 2 * last_level_node_num * n_tree_each_iter + class_label - right_index = (2 * last_level_node_num + 1) * \ - n_tree_each_iter + class_label - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) - else: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - root_node = trees_explicit[i][0] - - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - # Traverse tree via BFS and build tree with modelbuilder - if root_node.value is None: - root_id = mb.add_split( - tree_id=cur_tree_id, feature_index=root_node.split['feature_index'], feature_value=root_node.split['value'], - default_left=default_left) - nodes_queue = [(root_node, root_id)] - while nodes_queue: - cur_node, cur_node_id = nodes_queue.pop(0) - left_node = cur_node.left - # Check if node is a leaf - if left_node.value is None: - left_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=0, - feature_index=left_node.split['feature_index'], feature_value=left_node.split['value'], - default_left=default_left) - nodes_queue.append((left_node, left_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, response=left_node.value[class_label], parent_id=cur_node_id, position=0) - right_node = cur_node.right - # Check if node is a leaf - if right_node.value is None: - right_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=1, - feature_index=right_node.split['feature_index'], feature_value=right_node.split['value'], - default_left=default_left) - nodes_queue.append((right_node, right_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, response=cur_node.right.value[class_label], - parent_id=cur_node_id, position=1) - - else: - # Tree has only one node - mb.add_leaf(tree_id=cur_tree_id, - response=root_node.value[class_label]) - - return mb.model() +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import json +from collections import deque +from tempfile import NamedTemporaryFile +from typing import Any, Deque, Dict, List, Optional, Tuple +from warnings import warn + +import numpy as np + + +class CatBoostNode: + def __init__( + self, + split: Optional[Dict] = None, + value: Optional[List[float]] = None, + right: Optional[int] = None, + left: Optional[float] = None, + cover: Optional[float] = None, + ) -> None: + self.split = split + self.value = value + self.right = right + self.left = left + self.cover = cover + + +class Node: + """Helper class holding Tree Node information""" + + def __init__( + self, + cover: float, + is_leaf: bool, + default_left: bool, + feature: int, + value: float, + n_children: int = 0, + left_child: "Optional[Node]" = None, + right_child: "Optional[Node]" = None, + parent_id: Optional[int] = -1, + position: Optional[int] = -1, + ) -> None: + self.cover = cover + self.is_leaf = is_leaf + self.default_left = default_left + self.__feature = feature + self.value = value + self.n_children = n_children + self.left_child = left_child + self.right_child = right_child + self.parent_id = parent_id + self.position = position + + @staticmethod + def from_xgb_dict(input_dict: Dict[str, Any]) -> "Node": + if "children" in input_dict: + left_child = Node.from_xgb_dict(input_dict["children"][0]) + right_child = Node.from_xgb_dict(input_dict["children"][1]) + n_children = 2 + left_child.n_children + right_child.n_children + else: + left_child = None + right_child = None + n_children = 0 + is_leaf = "leaf" in input_dict + default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] + return Node( + cover=input_dict["cover"], + is_leaf=is_leaf, + default_left=default_left, + feature=input_dict.get("split"), + value=input_dict["leaf"] if is_leaf else input_dict["split_condition"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + + @staticmethod + def from_lightgbm_dict(input_dict: Dict[str, Any]) -> "Node": + if "tree_structure" in input_dict: + tree = input_dict["tree_structure"] + else: + tree = input_dict + + n_children = 0 + if "left_child" in tree: + left_child = Node.from_lightgbm_dict(tree["left_child"]) + n_children += 1 + left_child.n_children + else: + left_child = None + if "right_child" in tree: + right_child = Node.from_lightgbm_dict(tree["right_child"]) + n_children += 1 + right_child.n_children + else: + right_child = None + + is_leaf = "leaf_value" in tree + return Node( + cover=tree["leaf_count"] if is_leaf else tree["internal_count"], + is_leaf=is_leaf, + default_left=tree.get("default_left", 0), + feature=tree.get("split_feature"), + value=tree["leaf_value"] if is_leaf else tree["threshold"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + + def get_value_closest_float_downward(self) -> np.float64: + """Get the closest exact fp value smaller than self.value""" + return np.nextafter(np.single(self.value), np.single(-np.inf)) + + def get_children(self) -> "Optional[Tuple[Node, Node]]": + if not self.left_child or not self.right_child: + assert self.is_leaf + else: + return (self.left_child, self.right_child) + + @property + def feature(self) -> int: + if isinstance(self.__feature, int): + return self.__feature + if isinstance(self.__feature, str) and self.__feature.isnumeric(): + return int(self.__feature) + raise ValueError( + f"Feature names must be integers (got 
({type(self.__feature)}){self.__feature})" + ) + + +class TreeView: + """Helper class, treating a list of nodes as one tree""" + + def __init__(self, tree_id: int, root_node: Node) -> None: + self.tree_id = tree_id + self.root_node = root_node + + @property + def is_leaf(self) -> bool: + return self.root_node.is_leaf + + @property + def value(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + if self.root_node.value is None: + raise ValueError("Tree is leaf-only but leaf node has no value") + return self.root_node.value + + @property + def cover(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + return self.root_node.cover + + @property + def n_nodes(self) -> int: + return self.root_node.n_children + 1 + + +class TreeList(list): + """Helper class that is able to extract all information required by the + model builders from various objects""" + + @staticmethod + def from_xgb_booster(booster, max_trees: int) -> "TreeList": + """ + Load a TreeList from an xgb.Booster object + Note: We cannot type-hint the xgb.Booster without loading xgb as a dependency in pyx code, + therefore no type hint is added. + """ + tl = TreeList() + dump = booster.get_dump(dump_format="json", with_stats=True) + for tree_id, raw_tree in enumerate(dump): + if max_trees > 0 and tree_id == max_trees: + break + raw_tree_parsed = json.loads(raw_tree) + root_node = Node.from_xgb_dict(raw_tree_parsed) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) + + return tl + + @staticmethod + def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList": + """ + Load a TreeList from a lgbm Booster dump + Note: We cannot type-hint the Model without loading lightgbm as a dependency in pyx code, + therefore no type hint is added. 
+ """ + tl = TreeList() + for tree_id, tree_dict in enumerate(dump["tree_info"]): + root_node = Node.from_lightgbm_dict(tree_dict) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) + + return tl + + def __setitem__(self): + raise NotImplementedError( + "Use TreeList.from_*() methods to initialize a TreeList" + ) + + +def get_lightgbm_params(booster): + return booster.dump_model() + + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + + +def get_catboost_params(booster): + with NamedTemporaryFile() as fp: + booster.save_model(fp.name, "json") + fp.seek(0) + model_data = json.load(fp) + return model_data + + +def get_gbt_model_from_tree_list( + tree_list: TreeList, + n_iterations: int, + is_regression: bool, + n_features: int, + n_classes: int, + base_score: Optional[float] = None, +): + """Return a GBT Model from TreeList""" + + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + for counter, tree in enumerate(tree_list, start=1): + # find out the number of nodes in the tree + if is_regression: + tree_id = mb.create_tree(tree.n_nodes) + else: + tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) + + if counter % n_iterations == 0: + class_label += 1 + + if tree.is_leaf: + mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) + continue + + root_node = tree.root_node + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=root_node.feature, + feature_value=root_node.get_value_closest_float_downward(), + cover=root_node.cover, + default_left=root_node.default_left, + ) + + # create queue + node_queue: Deque[Node] = deque() + children = root_node.get_children() + assert children is not None + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + while node_queue: + node = node_queue.popleft() + assert node.parent_id != -1, "node.parent_id must not be -1" + assert node.position != -1, "node.position must not be -1" + + if node.is_leaf: + mb.add_leaf( + tree_id=tree_id, + response=node.value, + cover=node.cover, + parent_id=node.parent_id, + position=node.position, + ) + else: + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=node.feature, + feature_value=node.get_value_closest_float_downward(), + cover=node.cover, + default_left=node.default_left, + parent_id=node.parent_id, + position=node.position, + ) + + children = node.get_children() + assert children is not None + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + return mb.model(base_score=base_score) + + +def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: + if booster is None: + booster = model.dump_model() + + n_features = booster["max_feature_idx"] + 1 + n_iterations = len(booster["tree_info"]) / booster["num_tree_per_iteration"] + n_classes = booster["num_tree_per_iteration"] + + is_regression = False + objective_fun = booster["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass classification" + ) + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + tree_list = TreeList.from_lightgbm_booster_dump(booster) + + return 
get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + ) + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. + booster.feature_names = [str(i) for i in range(booster.num_features())] + + if xgb_config is None: + xgb_config = get_xgboost_params(booster) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) + elif objective_fun.startswith("binary:"): + if objective_fun not in ["binary:logistic", "binary:logitraw"]: + raise TypeError( + "only binary:logistic and binary:logitraw are supported for binary classification" + ) + n_classes = 2 + if objective_fun == "binary:logitraw": + # daal4py always applies a sigmoid for pred_proba, whereas XGBoost + # returns raw predictions with logitraw + warn( + "objective='binary:logitraw' selected\n" + "XGBoost returns raw class scores when calling pred_proba()\n" + "whilst scikit-learn-intelex always uses binary:logistic\n" + ) + if base_score != 0.5: + warn("objective='binary:logitraw' ignores base_score, fixing base_score to 0.5") + base_score = 0.5 + else: + is_regression = True + + # max_trees=0 if best_iteration does not exist + max_trees = getattr(booster, "best_iteration", -1) + 1 + if n_classes > 2: + max_trees *= n_classes + tree_list = TreeList.from_xgb_booster(booster, max_trees) + + if hasattr(booster, "best_iteration"): + n_iterations = booster.best_iteration + 1 + else: + n_iterations = len(tree_list) // (n_classes if n_classes > 2 else 1) + + return get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + base_score=base_score, + ) + + +def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + if not model.is_fitted(): + raise RuntimeError("Model should be fitted before exporting to daal4py.") + + if model_data is None: + model_data = get_catboost_params(model) + + if "categorical_features" in model_data["features_info"]: + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + + n_features = len(model_data["features_info"]["float_features"]) + + is_symmetric_tree = ( + model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] + == "SymmetricTree" + ) + + if is_symmetric_tree: + n_iterations = len(model_data["oblivious_trees"]) + else: + n_iterations = len(model_data["trees"]) + + n_classes = 0 + + if "class_params" in model_data["model_info"]: + is_classification = True + n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + else: + is_classification = False + mb = gbt_reg_model_builder(n_features, n_iterations) + + splits = [] + + # Create 
splits array (all splits are placed sequentially) + for feature in model_data["features_info"]["float_features"]: + if feature["borders"]: + for feature_border in feature["borders"]: + splits.append( + {"feature_index": feature["feature_index"], "value": feature_border} + ) + + if not is_classification: + bias = model_data["scale_and_bias"][1][0] / n_iterations + scale = model_data["scale_and_bias"][0] + else: + bias = 0 + scale = 1 + + trees_explicit = [] + tree_symmetric = [] + + if ( + model_data["model_info"]["params"]["data_processing_options"][ + "float_features_binarization" + ]["nan_mode"] + == "Min" + ): + default_left = 1 + else: + default_left = 0 + + for tree_num in range(n_iterations): + if is_symmetric_tree: + if model_data["oblivious_trees"][tree_num]["splits"] is not None: + # Tree has more than 1 node + cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) + else: + cur_tree_depth = 0 + + tree_symmetric.append( + (model_data["oblivious_trees"][tree_num], cur_tree_depth) + ) + else: + n_nodes = 1 + # Check if node is a leaf (in case of stump) + if "split" in model_data["trees"][tree_num]: + # Get number of trees and splits info via BFS + # Create queue + nodes_queue = [] + root_node = CatBoostNode( + split=splits[model_data["trees"][tree_num]["split"]["split_index"]] + ) + nodes_queue.append((model_data["trees"][tree_num], root_node)) + while nodes_queue: + cur_node_data, cur_node = nodes_queue.pop(0) + if "value" in cur_node_data: + if isinstance(cur_node_data["value"], list): + cur_node.value = [value for value in cur_node_data["value"]] + else: + cur_node.value = [cur_node_data["value"] * scale + bias] + else: + cur_node.split = splits[cur_node_data["split"]["split_index"]] + left_node = CatBoostNode() + right_node = CatBoostNode() + cur_node.left = left_node + cur_node.right = right_node + nodes_queue.append((cur_node_data["left"], left_node)) + nodes_queue.append((cur_node_data["right"], right_node)) + n_nodes += 2 + else: + root_node = CatBoostNode() + if is_classification and n_classes > 2: + root_node.value = [ + value * scale for value in model_data["trees"][tree_num]["value"] + ] + else: + root_node.value = [ + model_data["trees"][tree_num]["value"] * scale + bias + ] + trees_explicit.append((root_node, n_nodes)) + + tree_id = [] + class_label = 0 + count = 0 + + # Only 1 tree for each iteration in case of regression or binary classification + if not is_classification or n_classes == 2: + n_tree_each_iter = 1 + else: + n_tree_each_iter = n_classes + + # Create id for trees (for the right order in modelbuilder) + for i in range(n_iterations): + for c in range(n_tree_each_iter): + if is_symmetric_tree: + n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 + else: + n_nodes = trees_explicit[i][1] + + if is_classification and n_classes > 2: + tree_id.append(mb.create_tree(n_nodes, class_label)) + count += 1 + if count == n_iterations: + class_label += 1 + count = 0 + + elif is_classification: + tree_id.append(mb.create_tree(n_nodes, 0)) + else: + tree_id.append(mb.create_tree(n_nodes)) + + if is_symmetric_tree: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + cur_tree_info = tree_symmetric[i][0] + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + cur_tree_leaf_val = cur_tree_info["leaf_values"] + cur_tree_depth = tree_symmetric[i][1] + + if cur_tree_depth == 0: + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + else: + # One split used for the whole level + cur_level_split = splits[ + 
cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] + ] + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + prev_level_nodes = [root_id] + + # Iterate over levels, splits in json are reversed (root split is the last) + for cur_level in range(cur_tree_depth - 2, -1, -1): + cur_level_nodes = [] + for cur_parent in prev_level_nodes: + cur_level_split = splits[ + cur_tree_info["splits"][cur_level]["split_index"] + ] + cur_left_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=0, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + cur_right_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=1, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + cur_level_nodes.append(cur_left_node) + cur_level_nodes.append(cur_right_node) + prev_level_nodes = cur_level_nodes + + # Different storing format for leaves + if not is_classification or n_classes == 2: + for last_level_node_num in range(len(prev_level_nodes)): + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + cover=0.0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + cover=0.0, + ) + else: + for last_level_node_num in range(len(prev_level_nodes)): + left_index = ( + 2 * last_level_node_num * n_tree_each_iter + class_label + ) + right_index = ( + 2 * last_level_node_num + 1 + ) * n_tree_each_iter + class_label + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[left_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + cover=0.0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[right_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + cover=0.0, + ) + else: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + root_node = trees_explicit[i][0] + + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + # Traverse tree via BFS and build tree with modelbuilder + if root_node.value is None: + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=root_node.split["feature_index"], + feature_value=root_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue = [(root_node, root_id)] + while nodes_queue: + cur_node, cur_node_id = nodes_queue.pop(0) + left_node = cur_node.left + # Check if node is a leaf + if left_node.value is None: + left_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=0, + feature_index=left_node.split["feature_index"], + feature_value=left_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue.append((left_node, left_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=left_node.value[class_label], + parent_id=cur_node_id, + position=0, + cover=0.0, + ) + right_node = cur_node.right + # Check if node is a leaf + if right_node.value is None: + right_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=1, + 
feature_index=right_node.split["feature_index"], + feature_value=right_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue.append((right_node, right_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_node.right.value[class_label], + parent_id=cur_node_id, + position=1, + cover=0.0, + ) + + else: + # Tree has only one node + mb.add_leaf( + tree_id=cur_tree_id, + response=root_node.value[class_label], + cover=0.0, + ) + + warn("Models converted from CatBoost cannot be used for SHAP value calculation") + return mb.model(0.0) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index c906a0a537..7a99b07b8c 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -22,10 +22,16 @@ #include #include "onedal/version.hpp" -#if (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) -#define _gbt_inference_has_missing_values_support 1 +#if (((MAJOR_VERSION == 2024) && (MINOR_VERSION == 0) && (UPDATE_VERSION >= 1)) || ((MAJOR_VERSION == 2024) && (MINOR_VERSION >= 1)) || (MAJOR_VERSION > 2024)) + // added missing value support to GBT regression + // added SHAP value support + // added base_score parameter + #define _gbt_inference_api_version 2 +#elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) + // added missing value support to GBT classification + #define _gbt_inference_api_version 1 #else -#define _gbt_inference_has_missing_values_support 0 + #define _gbt_inference_api_version 0 #endif typedef daal::algorithms::gbt::classification::ModelBuilder c_gbt_classification_model_builder; @@ -39,32 +45,61 @@ typedef c_gbt_regression_model_builder::TreeId c_gbt_reg_tree_id; #define c_gbt_clf_no_parent c_gbt_classification_model_builder::noParent #define c_gbt_reg_no_parent c_gbt_regression_model_builder::noParent -static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_) +static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_, double base_score) { - return RAW()(obj_->getModel()); + daal::algorithms::gbt::classification::ModelPtr * ptr = RAW()(obj_->getModel()); +#if (_gbt_inference_api_version == 2) + ptr->get()->setPredictionBias(base_score); +#endif + return ptr; } - -static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_) +static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_, double base_score) { - return RAW()(obj_->getModel()); + daal::algorithms::gbt::regression::ModelPtr * ptr = RAW()(obj_->getModel()); +#if (_gbt_inference_api_version == 2) + ptr->get()->setPredictionBias(base_score); +#endif + return ptr; } -c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) +c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { -#if _gbt_inference_has_missing_values_support +#if (_gbt_inference_api_version == 2) + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, 
defaultLeft, cover); +#elif (_gbt_inference_api_version == 1) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue); #endif } -c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) +c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { -#if _gbt_inference_has_missing_values_support +#if (_gbt_inference_api_version == 2) + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft, cover); +#elif (_gbt_inference_api_version == 1) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue); #endif } +c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) +{ +#if (_gbt_inference_api_version == 2) + return c_ptr->addLeafNode(treeId, parentId, position, response, cover); +#else + return c_ptr->addLeafNode(treeId, parentId, position, response); +#endif +} + +c_gbt_reg_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) +{ +#if (_gbt_inference_api_version == 2) + return c_ptr->addLeafNode(treeId, parentId, position, response, cover); +#else + return c_ptr->addLeafNode(treeId, parentId, position, response); +#endif +} + #endif // _GBT_MODEL_BUILDER_INCLUDED_ diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index f46264ed94..418390a4ec 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -27,21 +27,24 @@ cdef extern from "gbt_model_builder.h": cdef size_t c_gbt_clf_no_parent cdef size_t c_gbt_reg_no_parent - cdef gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *) - cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *) + cdef gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *, double base_score) + cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *, double base_score) cdef cppclass c_gbt_classification_model_builder: c_gbt_classification_model_builder(size_t nFeatures, size_t nIterations, size_t nClasses) except + c_gbt_clf_tree_id createTree(size_t nNodes, size_t classLabel) - c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response) + c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) cdef cppclass c_gbt_regression_model_builder: c_gbt_regression_model_builder(size_t nFeatures, size_t nIterations) except + c_gbt_reg_tree_id createTree(size_t nNodes) - c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response) + c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id 
parentId, size_t position, double response, double cover) - cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) - cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) + cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) + cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) + + cdef c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) + cdef c_gbt_reg_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) cdef class gbt_classification_model_builder: ''' @@ -65,7 +68,7 @@ cdef class gbt_classification_model_builder: ''' return self.c_ptr.createTree(n_nodes, class_label) - def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): + def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): ''' Create Leaf node and add it to certain tree :param node-handle parent_id: parent node to which new node is added (use noParent for root node) :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree) :param double response: response value for leaf node to be predicted + :param double cover: cover (sum_hess) of the leaf node :rtype: node identifier ''' - return self.c_ptr.addLeafNode(tree_id, parent_id, position, response) + return clfAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover) - def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): + def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): ''' Create Split node and add it to certain tree. 
@@ -87,18 +91,20 @@ cdef class gbt_classification_model_builder: :param size_t feature_index: feature index for spliting :param double feature_value: feature value for spliting :param int default_left: default behaviour in case of missing value + :param double cover: cover (sum_hess) of the split node :rtype: node identifier ''' - return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left) + return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover) - def model(self): + def model(self, base_score): ''' Get built model + :param double base_score: global prediction bias (used e.g. in XGBoost) :rtype: gbt_classification_model ''' cdef gbt_classification_model res = gbt_classification_model.__new__(gbt_classification_model) - res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr) + res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr, base_score or 0.0) return res @@ -123,7 +129,7 @@ cdef class gbt_regression_model_builder: ''' return self.c_ptr.createTree(n_nodes) - def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0): + def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0): ''' Create Leaf node and add it to certain tree @@ -131,11 +137,12 @@ cdef class gbt_regression_model_builder: :param node-handle parent_id: parent node to which new node is added (use noParent for root node) :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree) :param double response: response value for leaf node to be predicted + :param double cover: cover (sum_hess) of the leaf node :rtype: node identifier ''' - return self.c_ptr.addLeafNode(tree_id, parent_id, position, response) + return regAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover) - def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0): + def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0): ''' Create Split node and add it to certain tree. @@ -144,19 +151,21 @@ cdef class gbt_regression_model_builder: :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree) :param size_t feature_index: feature index for spliting :param double feature_value: feature value for spliting + :param double cover: cover (sum_hess) of the split node :param int default_left: default behaviour in case of missing value :rtype: node identifier ''' - return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left) + return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover) - def model(self): + def model(self, base_score): ''' Get built model + :param double base_score: global prediction bias (used e.g. 
in XGBoost) :rtype: gbt_regression_model ''' cdef gbt_regression_model res = gbt_regression_model.__new__(gbt_regression_model) - res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr) + res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr, base_score or 0.0) return res diff --git a/tests/test_examples.py b/tests/test_examples.py index a9e8adaf23..c509416148 100755 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -19,9 +19,11 @@ test_path = os.path.abspath(os.path.dirname(__file__)) unittest_data_path = os.path.join(test_path, "unittest_data") -examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py") -sys.path.insert(0, examples_path) -os.chdir(examples_path) +daal4py_examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py") +mb_examples_path = os.path.join(os.path.dirname(test_path), "examples", "mb") +sys.path.insert(0, daal4py_examples_path) +sys.path.insert(0, mb_examples_path) +os.chdir(daal4py_examples_path) import unittest @@ -270,6 +272,13 @@ def test_svm(self): ((2020, "P", 2), (2021, "B", 109)), ["xgboost"], ), + ( + "model_builders_xgboost_shap", + None, + None, + (2024, "P", 1), + ["xgboost"], + ), ("model_builders_catboost", None, None, (2021, "P", 4), ["catboost"]), ("gradient_boosted_classification",), ("gradient_boosted_regression",), diff --git a/tests/test_logistic_regression_model_builder.py b/tests/test_logistic_regression_model_builder.py deleted file mode 100644 index 3a28677743..0000000000 --- a/tests/test_logistic_regression_model_builder.py +++ /dev/null @@ -1,142 +0,0 @@ -# ============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import unittest - -import numpy as np -from sklearn.datasets import load_breast_cancer, load_iris -from sklearn.linear_model import LogisticRegression - -import daal4py as d4p -from daal4py import _get__daal_link_version__ as dv -from daal4py.sklearn._utils import daal_check_version - -# First item is major version - 2021, -# second is minor+patch - 0110, -# third item is status - B -daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) -reason = str(((2021, "P", 1))) + " not supported in this library version " -reason += str(daal_version) - - -class LogRegModelBuilder(unittest.TestCase): - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - def test_iris_with_intercept(self): - X, y = load_iris(return_X_y=True) - n_classes = 3 - clf = LogisticRegression(fit_intercept=True, max_iter=1000, random_state=0).fit( - X, y - ) - builder = d4p.logistic_regression_model_builder( - n_classes=n_classes, n_features=X.shape[1] - ) - builder.set_beta(clf.coef_, clf.intercept_) - - alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) - - pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() - pred_sklearn = clf.predict(X) - self.assertTrue(np.allclose(pred_daal, pred_sklearn)) - - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - def test_iris_without_intercept(self): - X, y = load_iris(return_X_y=True) - n_classes = 3 - clf = LogisticRegression(fit_intercept=False, max_iter=1000, random_state=0).fit( - X, y - ) - builder = d4p.logistic_regression_model_builder( - n_classes=n_classes, n_features=X.shape[1] - ) - builder.set_beta(clf.coef_, clf.intercept_) - - alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) - - pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() - pred_sklearn = clf.predict(X) - self.assertTrue(np.allclose(pred_daal, pred_sklearn)) - - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - def test_breast_cancer_with_intercept(self): - X, y = load_breast_cancer(return_X_y=True) - n_classes = 2 - clf = LogisticRegression(fit_intercept=True, max_iter=10000, random_state=0).fit( - X, y - ) - builder = d4p.logistic_regression_model_builder( - n_classes=n_classes, n_features=X.shape[1] - ) - builder.set_beta(clf.coef_, clf.intercept_) - - alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) - - pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() - pred_sklearn = clf.predict(X) - self.assertTrue(np.allclose(pred_daal, pred_sklearn)) - - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - def test_breast_cancer_without_intercept(self): - X, y = load_breast_cancer(return_X_y=True) - n_classes = 2 - clf = LogisticRegression(fit_intercept=False, max_iter=10000, random_state=0).fit( - X, y - ) - builder = d4p.logistic_regression_model_builder( - n_classes=n_classes, n_features=X.shape[1] - ) - builder.set_beta(clf.coef_, clf.intercept_) - - alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) - - pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() - pred_sklearn = clf.predict(X) - 
self.assertTrue(np.allclose(pred_daal, pred_sklearn)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py new file mode 100644 index 0000000000..93b39fd77e --- /dev/null +++ b/tests/test_model_builders.py @@ -0,0 +1,780 @@ +# ============================================================================== +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest + +import lightgbm as lgbm +import numpy as np +import xgboost as xgb +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_regression, +) +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +import daal4py as d4p +from daal4py.sklearn._utils import daal_check_version + +try: + import catboost as cb + + cb_available = True +except ImportError: + cb_available = False + +try: + import shap + + shap_available = True +except ImportError: + shap_available = False + + +shap_required_version = (2024, "P", 1) +shap_supported = daal_check_version(shap_required_version) +shap_not_supported_str = ( + f"SHAP value calculation only supported for version {shap_required_version} or later" +) +shap_unavailable_str = "SHAP Python package not available" +cb_unavailable_str = "CatBoost not available" + + +class LogRegModelBuilder(unittest.TestCase): + def test_iris_with_intercept(self): + X, y = load_iris(return_X_y=True) + n_classes = 3 + clf = LogisticRegression(fit_intercept=True, max_iter=1000, random_state=0).fit( + X, y + ) + builder = d4p.logistic_regression_model_builder( + n_classes=n_classes, n_features=X.shape[1] + ) + builder.set_beta(clf.coef_, clf.intercept_) + + alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) + + pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() + pred_sklearn = clf.predict(X) + self.assertTrue(np.allclose(pred_daal, pred_sklearn)) + + def test_iris_without_intercept(self): + X, y = load_iris(return_X_y=True) + n_classes = 3 + clf = LogisticRegression(fit_intercept=False, max_iter=1000, random_state=0).fit( + X, y + ) + builder = d4p.logistic_regression_model_builder( + n_classes=n_classes, n_features=X.shape[1] + ) + builder.set_beta(clf.coef_, clf.intercept_) + + alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) + + pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() + pred_sklearn = clf.predict(X) + self.assertTrue(np.allclose(pred_daal, pred_sklearn)) + + def test_breast_cancer_with_intercept(self): + X, y = load_breast_cancer(return_X_y=True) + n_classes = 2 + clf = LogisticRegression(fit_intercept=True, max_iter=10000, random_state=0).fit( + X, y + ) + builder = d4p.logistic_regression_model_builder( + n_classes=n_classes, n_features=X.shape[1] + ) + builder.set_beta(clf.coef_, clf.intercept_) + + alg_pred = 
d4p.logistic_regression_prediction(nClasses=n_classes) + + pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() + pred_sklearn = clf.predict(X) + self.assertTrue(np.allclose(pred_daal, pred_sklearn)) + + def test_breast_cancer_without_intercept(self): + X, y = load_breast_cancer(return_X_y=True) + n_classes = 2 + clf = LogisticRegression(fit_intercept=False, max_iter=10000, random_state=0).fit( + X, y + ) + builder = d4p.logistic_regression_model_builder( + n_classes=n_classes, n_features=X.shape[1] + ) + builder.set_beta(clf.coef_, clf.intercept_) + + alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes) + + pred_daal = alg_pred.compute(X, builder.model).prediction.flatten() + pred_sklearn = clf.predict(X) + self.assertTrue(np.allclose(pred_daal, pred_sklearn)) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostRegressionModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls, base_score=0.5): + X, y = make_regression(n_samples=100, n_features=10, random_state=42) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + cls.xgb_model = xgb.XGBRegressor( + max_depth=5, n_estimators=50, random_state=42, base_score=base_score + ) + cls.xgb_model.fit(X, y) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + # XGBoost treats regression as 0 classes, LightGBM 1 class + # For us, it does not make a difference and both are acceptable + self.assertEqual(m.n_classes_, 0) + self.assertEqual(m.n_features_in_, 10) + self.assertTrue(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_test) + xgboost_pred = self.xgb_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_nan) + xgboost_pred = self.xgb_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + + def test_model_predict_shap_contribs(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X_test, pred_contribs=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X_test), + pred_contribs=True, + approx_contribs=False, + validate_features=False, + ) + self.assertTrue( + d4p_pred.shape == xgboost_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {xgboost_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + + def test_model_predict_shap_interactions(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X_test, pred_interactions=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X_test), + pred_interactions=True, + approx_contribs=False, + validate_features=False, + ) + self.assertTrue( + d4p_pred.shape == xgboost_pred.shape, + f"d4p and reference SHAP interaction shape is different {d4p_pred.shape} != {xgboost_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + + def test_model_predict_shap_contribs_missing_values(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X_nan, pred_contribs=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X_nan), + pred_contribs=True, + approx_contribs=False, + validate_features=False, 
+ ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=5e-6) + + +# duplicate all tests for base_score=0.0 +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostRegressionModelBuilder.setUpClass(0) + + +# duplicate all tests for base_score=100 +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostRegressionModelBuilder.setUpClass(100) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"): + n_features = 15 + cls.base_score = base_score + cls.n_classes = n_classes + X, y = make_classification( + n_samples=500, + n_classes=n_classes, + n_features=n_features, + n_informative=10, + random_state=42, + ) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 2 * n_features, dtype=np.float32).reshape( + 2, n_features + ) + cls.xgb_model = xgb.XGBClassifier( + max_depth=5, + n_estimators=50, + random_state=42, + base_score=base_score, + objective=objective, + ) + cls.xgb_model.fit(X, y) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + self.assertEqual(m.n_classes_, self.n_classes) + self.assertEqual(m.n_features_in_, 15) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_test) + xgboost_pred = self.xgb_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7) + + def test_model_predict_proba(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict_proba(self.X_test) + xgboost_pred = self.xgb_model.predict_proba(self.X_test) + # calculating probas involves multiple exp / ln operations, therefore + # they're quite susceptible to small numerical changes and we have to + # accept an rtol of 1e-5 + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_nan) + xgboost_pred = self.xgb_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_contribs=True) + + def test_model_predict_shap_interactions(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_interactions=True) + + +# duplicate all tests for base_score=0.3 +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(base_score=0.3) + + +# duplicate all tests for base_score=0.7 +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + 
XGBoostClassificationModelBuilder.setUpClass(base_score=0.7) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(n_classes=5) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder_n_classes5_base_score03( + XGBoostClassificationModelBuilder +): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostClassificationModelBuilder_objective_logitraw( + XGBoostClassificationModelBuilder +): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass( + base_score=0.5, n_classes=2, objective="binary:logitraw" + ) + + def test_model_predict_proba(self): + # overload this function because daal4py always applies the sigmoid + # for bias 0.5, we can still check if the original scores are correct + with self.assertWarns(UserWarning): + # expect a warning that logitraw behaves differently and/or + # that base_score is ignored / fixed to 0.5 + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict_proba(self.X_test) + # undo sigmoid + d4p_pred = np.log(-d4p_pred / (d4p_pred - 1)) + # undo bias + d4p_pred += 0.5 + xgboost_pred = self.xgb_model.predict_proba(self.X_test) + # calculating probas involves multiple exp / ln operations, therefore + # they're quite susceptible to small numerical changes and we have to + # accept an rtol of 1e-5 + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class LightGBMRegressionModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_regression(n_samples=100, n_features=10, random_state=42) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + # LightGBM requires a couple of NaN values in the training data to properly set + # the missing value type to NaN + # https://github.com/microsoft/LightGBM/issues/6139 + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) + params = { + "task": "train", + "boosting": "gbdt", + "objective": "regression", + "num_leaves": 4, + "learning_rate": 0.05, + "metric": {"l2", "l1"}, + "verbose": -1, + "n_estimators": 1, + } + cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.lgbm_model) + self.assertEqual(m.n_classes_, 1) + self.assertEqual(m.n_features_in_, 10) + self.assertTrue(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test) + lgbm_pred = self.lgbm_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan) + lgbm_pred = self.lgbm_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6) + + @unittest.skipUnless(shap_available, reason=shap_unavailable_str) + def test_model_predict_shap_contribs(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test, pred_contribs=True) + explainer = shap.TreeExplainer(self.lgbm_model) + shap_pred = 
explainer(self.X_test).values + lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True) + self.assertTrue( + d4p_pred.shape == lgbm_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) + + @unittest.skipUnless(shap_available, reason=shap_unavailable_str) + def test_model_predict_shap_interactions(self): + m = d4p.mb.convert_model(self.lgbm_model) + # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column + d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1] + explainer = shap.TreeExplainer(self.lgbm_model) + shap_pred = explainer.shap_interaction_values(self.X_test) + self.assertTrue( + d4p_pred.shape == shap_pred.shape, + f"d4p and reference SHAP interaction shape is different {d4p_pred.shape} != {shap_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6) + + def test_model_predict_shap_contribs_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan, pred_contribs=True) + lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True) + self.assertTrue( + d4p_pred.shape == lgbm_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class LightGBMClassificationModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_classification( + random_state=3, n_classes=3, n_informative=3, n_features=10 + ) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) + params = { + "n_estimators": 10, + "task": "train", + "boosting": "gbdt", + "objective": "multiclass", + "num_leaves": 4, + "num_class": 3, + "verbose": -1, + } + cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.lgbm_model) + self.assertEqual(m.n_classes_, 3) + self.assertEqual(m.n_features_in_, 10) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test) + lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_proba(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict_proba(self.X_test) + lgbm_pred = self.lgbm_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan) + lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_contribs=True) + + def test_model_predict_shap_interactions(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_interactions=True) + + def 
test_model_predict_shap_contribs_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_nan, pred_contribs=True) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_classification( + random_state=3, n_classes=2, n_informative=3, n_features=10 + ) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) + params = { + "n_estimators": 10, + "task": "train", + "boosting": "gbdt", + "objective": "binary", + "metric": "binary_logloss", + "num_leaves": 4, + "verbose": -1, + } + cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.lgbm_model) + self.assertEqual(m.n_classes_, 2) + self.assertEqual(m.n_features_in_, 10) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test) + lgbm_pred = np.round(self.lgbm_model.predict(self.X_test)).astype(int) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_proba(self): + m = d4p.mb.convert_model(self.lgbm_model) + # predict proba of being class 1 + d4p_pred = m.predict_proba(self.X_test)[:, 1] + lgbm_pred = self.lgbm_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan) + lgbm_pred = np.round(self.lgbm_model.predict(self.X_nan)).astype(int) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_proba_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + # predict proba of being class 1 + d4p_pred = m.predict_proba(self.X_nan)[:, 1] + lgbm_pred = self.lgbm_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_contribs=True) + + def test_model_predict_shap_interactions(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_interactions=True) + + def test_model_predict_shap_contribs_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_nan, pred_contribs=True) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +@unittest.skipUnless(cb_available, reason=cb_unavailable_str) +class CatBoostRegressionModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_regression(n_samples=100, n_features=10, random_state=42) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + params = { + "reg_lambda": 1, + "max_depth": 3, + "num_leaves": 2**3, + "verbose": 0, + "objective": "RMSE", + "learning_rate": 0.3, + "n_estimators": 25, + } + cls.cb_model = cb.CatBoost(params) + cls.cb_model.fit(X, y, verbose=0) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.cb_model) + self.assertTrue(hasattr(m, "daal_model_")) + self.assertIsInstance(m.daal_model_, 
d4p._daal4py.gbt_regression_model) + self.assertEqual(m.daal_model_.NumberOfFeatures, 10) + self.assertEqual(m.daal_model_.NumberOfTrees, 25) + self.assertEqual(m.n_features_in_, 10) + self.assertTrue(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.cb_model) + d4p_pred = m.predict(self.X_test) + cb_pred = self.cb_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.cb_model) + d4p_pred = m.predict(self.X_nan) + cb_pred = self.cb_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + # SHAP value support from CatBoost models is to be added + with self.assertWarnsRegex( + Warning, + "Models converted from CatBoost cannot be used for SHAP value calculation", + ): + d4p.mb.convert_model(self.cb_model) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +@unittest.skipUnless(cb_available, reason=cb_unavailable_str) +class CatBoostClassificationModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_classification( + n_classes=3, n_features=10, n_informative=3, random_state=42 + ) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + params = { + "reg_lambda": 1, + "max_depth": 3, + "num_leaves": 2**3, + "verbose": 0, + "objective": "MultiClass", + "learning_rate": 0.3, + "n_estimators": 25, + } + cls.cb_model = cb.CatBoost(params) + cls.cb_model.fit(X, y, verbose=0) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.cb_model) + self.assertTrue(hasattr(m, "daal_model_")) + self.assertIsInstance(m.daal_model_, d4p._daal4py.gbt_classification_model) + self.assertEqual(m.daal_model_.NumberOfFeatures, 10) + self.assertEqual(m.daal_model_.NumberOfTrees, 3 * 25) + self.assertEqual(m.n_features_in_, 10) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.cb_model) + d4p_pred = m.predict(self.X_test) + cb_pred = self.cb_model.predict(self.X_test, prediction_type="Class").T[0] + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.cb_model) + d4p_pred = m.predict(self.X_nan) + cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0] + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + # SHAP value support from CatBoost models is to be added + with self.assertWarnsRegex( + Warning, + "Models converted from CatBoost cannot be used for SHAP value calculation", + ): + d4p.mb.convert_model(self.cb_model) + + +@unittest.skipUnless(shap_supported, reason=shap_not_supported_str) +class XGBoostEarlyStopping(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + num_classes = 3 + X, y = make_classification( + n_samples=1500, + n_features=10, + n_informative=3, + n_classes=num_classes, + random_state=42, + ) + X_train, cls.X_test, y_train, cls.y_test = train_test_split( + X, y, test_size=0.5, random_state=42 + ) + + # training parameters setting + params = { + "n_estimators": 100, + "max_bin": 256, + "scale_pos_weight": 2, + "lambda_l2": 1, + "alpha": 0.9, + "max_depth": 8, + "num_leaves": 2**8, + "verbosity": 0, + "objective": "multi:softproba", + "learning_rate": 0.3, + "num_class": num_classes, + "early_stopping_rounds": 5, + "verbose_eval": False, + } + + 
cls.xgb_clf = xgb.XGBClassifier(**params) + cls.xgb_clf.fit( + X_train, y_train, eval_set=[(cls.X_test, cls.y_test)], verbose=False + ) + cls.daal_model = d4p.mb.convert_model(cls.xgb_clf.get_booster()) + + def test_early_stopping(self): + xgb_prediction = self.xgb_clf.predict(self.X_test) + xgb_proba = self.xgb_clf.predict_proba(self.X_test) + xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(self.y_test)) + + daal_prediction = self.daal_model.predict(self.X_test) + daal_proba = self.daal_model.predict_proba(self.X_test) + daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(self.y_test)) + + self.assertEqual(xgb_errors_count, daal_errors_count) + + np.testing.assert_allclose(xgb_proba, daal_proba, rtol=1e-6) + + +class ModelBuilderTreeView(unittest.TestCase): + def test_model_from_booster(self): + class MockBooster: + def get_dump(self, *_, **kwargs): + # raw dump of 2 trees with a max depth of 1 + return [ + ' { "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, "yes": 1, "no": 2, "missing": 1 , "gain": 3, "cover": 4, "children": [\n { "nodeid": 1, "leaf": 5 , "cover": 6 }, \n { "nodeid": 2, "leaf": 7 , "cover":8 }\n ]}', + ' { "nodeid": 0, "leaf": 0.2 , "cover": 42 }', + ] + + mock = MockBooster() + result = d4p.TreeList.from_xgb_booster(mock, max_trees=0) + self.assertEqual(len(result), 2) + + tree0 = result[0] + self.assertIsInstance(tree0, d4p.TreeView) + self.assertFalse(tree0.is_leaf) + with self.assertRaises(ValueError): + tree0.cover + with self.assertRaises(ValueError): + tree0.value + + self.assertIsInstance(tree0.root_node, d4p.Node) + + self.assertEqual(tree0.root_node.cover, 4) + self.assertEqual(tree0.root_node.left_child.cover, 6) + self.assertEqual(tree0.root_node.right_child.cover, 8) + + self.assertFalse(tree0.root_node.is_leaf) + self.assertTrue(tree0.root_node.left_child.is_leaf) + self.assertTrue(tree0.root_node.right_child.is_leaf) + + self.assertTrue(tree0.root_node.default_left) + self.assertFalse(tree0.root_node.left_child.default_left) + self.assertFalse(tree0.root_node.right_child.default_left) + + self.assertEqual(tree0.root_node.feature, 1) + with self.assertRaises(ValueError): + tree0.root_node.left_child.feature + with self.assertRaises(ValueError): + tree0.root_node.right_child.feature + + self.assertEqual(tree0.root_node.value, 2) + self.assertEqual(tree0.root_node.left_child.value, 5) + self.assertEqual(tree0.root_node.right_child.value, 7) + + self.assertEqual(tree0.root_node.n_children, 2) + self.assertEqual(tree0.root_node.left_child.n_children, 0) + self.assertEqual(tree0.root_node.right_child.n_children, 0) + + self.assertIsNone(tree0.root_node.left_child.left_child) + self.assertIsNone(tree0.root_node.left_child.right_child) + self.assertIsNone(tree0.root_node.right_child.left_child) + self.assertIsNone(tree0.root_node.right_child.right_child) + + tree1 = result[1] + self.assertIsInstance(tree1, d4p.TreeView) + self.assertTrue(tree1.is_leaf) + self.assertEqual(tree1.n_nodes, 1) + self.assertEqual(tree1.cover, 42) + self.assertEqual(tree1.value, 0.2) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py deleted file mode 100644 index 60ab1b9bdd..0000000000 --- a/tests/test_xgboost_mb.py +++ /dev/null @@ -1,101 +0,0 @@ -# ============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except 
in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import importlib.util -import unittest - -import numpy as np -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split - -import daal4py as d4p -from daal4py import _get__daal_link_version__ as dv -from daal4py.sklearn._utils import daal_check_version - -# First item is major version - 2021, -# second is minor+patch - 0110, -# third item is status - B -daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) -reason = str(((2021, "P", 1))) + " not supported in this library version " -reason += str(daal_version) - - -class XgboostModelBuilder(unittest.TestCase): - @unittest.skipUnless( - all( - [ - hasattr(d4p, "get_gbt_model_from_xgboost"), - hasattr(d4p, "gbt_classification_prediction"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - @unittest.skipUnless( - importlib.util.find_spec("xgboost") is not None, - "xgboost library is not installed", - ) - def test_earlystop(self): - import xgboost as xgb - - num_classes = 3 - X, y = make_classification( - n_samples=1000, - n_features=10, - n_informative=3, - n_classes=num_classes, - random_state=42, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=42 - ) - - # training parameters setting - params = { - "n_estimators": 100, - "max_bin": 256, - "scale_pos_weight": 2, - "lambda_l2": 1, - "alpha": 0.9, - "max_depth": 8, - "num_leaves": 2**8, - "verbosity": 0, - "objective": "multi:softproba", - "learning_rate": 0.3, - "num_class": num_classes, - "early_stopping_rounds": 5, - } - - xgb_clf = xgb.XGBClassifier(**params) - xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) - booster = xgb_clf.get_booster() - - xgb_prediction = xgb_clf.predict(X_test) - xgb_proba = xgb_clf.predict_proba(X_test) - xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) - - daal_model = d4p.mb.convert_model(booster) - - daal_prediction = daal_model.predict(X_test) - daal_proba = daal_model.predict_proba(X_test) - daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) - - self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) - self.assertTrue(np.allclose(xgb_proba, daal_proba)) - - -if __name__ == "__main__": - unittest.main()
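
Reviewer note: a minimal end-to-end sketch of the converter API this patch enables, assembled only from calls exercised in tests/test_model_builders.py above (d4p.mb.convert_model, predict with pred_contribs / pred_interactions). The dataset shape and model parameters are illustrative assumptions, not part of the patch, and SHAP support requires a oneDAL build matching the _gbt_inference_api_version == 2 branch (2024.0.1 or later).

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_regression

import daal4py as d4p

# Train a small XGBoost regressor (illustrative parameters only).
X, y = make_regression(n_samples=100, n_features=10, random_state=42)
booster = xgb.XGBRegressor(max_depth=5, n_estimators=50, random_state=42).fit(X, y).get_booster()

# Convert the booster into a daal4py GBT model; the booster's base_score is
# carried over as the prediction bias via the new model(base_score) plumbing.
m = d4p.mb.convert_model(booster)

pred = m.predict(X)                                  # plain predictions
contribs = m.predict(X, pred_contribs=True)          # per-feature SHAP contributions plus bias column
interactions = m.predict(X, pred_interactions=True)  # pairwise SHAP interaction values

# The converted model should reproduce XGBoost's native SHAP output within
# floating-point tolerance, as the new tests assert.
np.testing.assert_allclose(contribs, booster.predict(xgb.DMatrix(X), pred_contribs=True), rtol=1e-6)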