diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml
index fb03f51f53..f62c827c9e 100644
--- a/.ci/pipeline/build-and-test-lnx.yml
+++ b/.ci/pipeline/build-and-test-lnx.yml
@@ -45,7 +45,7 @@ steps:
. /usr/share/miniconda/etc/profile.d/conda.sh
conda activate CB
bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION)
- pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt
+ pip install --upgrade -r requirements-test.txt
pip install $(python .ci/scripts/get_compatible_scipy_version.py)
if [ $(echo $(PYTHON_VERSION) | grep '3.8\|3.9\|3.10') ]; then conda install -q -y -c intel dpnp; fi
pip list
diff --git a/.ci/pipeline/build-and-test-mac.yml b/.ci/pipeline/build-and-test-mac.yml
index c9f6d05345..0df12bc5d0 100644
--- a/.ci/pipeline/build-and-test-mac.yml
+++ b/.ci/pipeline/build-and-test-mac.yml
@@ -40,7 +40,7 @@ steps:
- script: |
source activate CB
bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION)
- pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt
+ pip install --upgrade -r requirements-test.txt
pip install $(python .ci/scripts/get_compatible_scipy_version.py)
pip list
displayName: 'Install testing requirements'
diff --git a/.ci/pipeline/build-and-test-win.yml b/.ci/pipeline/build-and-test-win.yml
index 1bf9d2e365..2875513cb3 100644
--- a/.ci/pipeline/build-and-test-win.yml
+++ b/.ci/pipeline/build-and-test-win.yml
@@ -43,7 +43,7 @@ steps:
set PATH=C:\msys64\usr\bin;%PATH%
call activate CB
bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION)
- pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt
+ pip install --upgrade -r requirements-test.txt
cd ..
for /f "delims=" %%c in ('python s\.ci\scripts\get_compatible_scipy_version.py') do set SCIPY_VERSION=%%c
pip install %SCIPY_VERSION%
diff --git a/.ci/pipeline/nightly.yml b/.ci/pipeline/nightly.yml
index d6ea8393e4..7c3e707cfe 100644
--- a/.ci/pipeline/nightly.yml
+++ b/.ci/pipeline/nightly.yml
@@ -64,7 +64,7 @@ jobs:
conda activate CB
pip install -r dependencies-dev
pip install -r requirements-doc.txt
- pip install -r requirements-test.txt -r requirements-test-optional.txt
+ pip install -r requirements-test.txt
pip install jupyter matplotlib requests
displayName: 'Install requirements'
- script: |
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index ceb64309c4..53dc619061 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -13,17 +13,16 @@ requirements-doc.txt @maria-Petrova @napetrov @aepanchi @Alexsandruss
onedal/ @Alexsandruss @samir-nasibli @KulikovNikita
sklearnex/ @Alexsandruss @samir-nasibli @KulikovNikita
-# Examples
+# Examples
examples/ @maria-Petrova @Alexsandruss @samir-nasibli @napetrov
# Dependencies
setup.py @napetrov @Alexsandruss @samir-nasibli
requirements* @napetrov @Alexsandruss @samir-nasibli @homksei @ahuber21 @ethanglaser
-conda-recipe/ @napetrov @Alexsandruss
+conda-recipe/ @napetrov @Alexsandruss
# Model builders
*model_builders* @razdoburdin @ahuber21 @avolkov-intel
-requirements-test-optional.txt @razdoburdin @ahuber21 @avolkov-intel
# Forests
*ensemble* @ahuber21 @icfaust
diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py
index ce7f82b2e3..bad2eddf19 100644
--- a/daal4py/mb/model_builders.py
+++ b/daal4py/mb/model_builders.py
@@ -200,7 +200,9 @@ def _predict_classification(self, X, fptype, resultsToEvaluate):
else:
return predict_result.probabilities
- def _predict_regression(self, X, fptype):
+ def _predict_regression(
+ self, X, fptype, pred_contribs=False, pred_interactions=False
+ ):
if X.shape[1] != self.n_features_in_:
raise ValueError("Shape of input is different from what was seen in `fit`")
@@ -212,22 +214,64 @@ def _predict_regression(self, X, fptype):
).format(type(self).__name__)
)
- # Prediction
+ try:
+ return self._predict_regression_with_results_to_compute(
+ X, fptype, pred_contribs, pred_interactions
+ )
+ except TypeError as e:
+ if "unexpected keyword argument 'resultsToCompute'" in str(e):
+ if pred_contribs or pred_interactions:
+ # SHAP values requested, but not supported by this version
+ raise TypeError(
+ f"{'pred_contribs' if pred_contribs else 'pred_interactions'} not supported by this version of daalp4y"
+ ) from e
+ else:
+ # unknown type error
+ raise
+
+ # fallback to calculation without `resultsToCompute`
predict_algo = d4p.gbt_regression_prediction(fptype=fptype)
predict_result = predict_algo.compute(X, self.daal_model_)
-
return predict_result.prediction.ravel()
+ def _predict_regression_with_results_to_compute(
+ self, X, fptype, pred_contribs=False, pred_interactions=False
+ ):
+ """Assume daal4py supports the resultsToCompute kwarg"""
+ resultsToCompute = ""
+ if pred_contribs:
+ resultsToCompute = "shapContributions"
+ elif pred_interactions:
+ resultsToCompute = "shapInteractions"
+
+ predict_algo = d4p.gbt_regression_prediction(
+ fptype=fptype, resultsToCompute=resultsToCompute
+ )
+ predict_result = predict_algo.compute(X, self.daal_model_)
+
+ if pred_contribs:
+ return predict_result.prediction.ravel().reshape((-1, X.shape[1] + 1))
+ elif pred_interactions:
+ return predict_result.prediction.ravel().reshape(
+ (-1, X.shape[1] + 1, X.shape[1] + 1)
+ )
+ else:
+ return predict_result.prediction.ravel()
+
class GBTDAALModel(GBTDAALBaseModel):
def __init__(self):
pass
- def predict(self, X):
+ def predict(self, X, pred_contribs=False, pred_interactions=False):
fptype = getFPType(X)
if self._is_regression:
- return self._predict_regression(X, fptype)
+ return self._predict_regression(X, fptype, pred_contribs, pred_interactions)
else:
+ if pred_contribs or pred_interactions:
+ raise NotImplementedError(
+ f"{'pred_contribs' if pred_contribs else 'pred_interactions'} is not implemented for classification models"
+ )
return self._predict_classification(X, fptype, "computeClassLabels")
def predict_proba(self, X):
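For orientation, here is a minimal usage sketch of the new keyword arguments (illustrative only, not part of the patch). It assumes a small synthetic regression problem and uses the `d4p.mb.convert_model()` entry point that appears in the examples below:

```python
# Sketch: requesting SHAP values from a converted GBT model.
import xgboost as xgb
from sklearn.datasets import make_regression

import daal4py as d4p

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
booster = xgb.XGBRegressor(n_estimators=10).fit(X, y).get_booster()
model = d4p.mb.convert_model(booster)

contribs = model.predict(X, pred_contribs=True)          # shape (100, 6)
interactions = model.predict(X, pred_interactions=True)  # shape (100, 6, 6)
```

On daal4py builds that predate `resultsToCompute`, both calls raise the `TypeError` produced by the fallback above.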
diff --git a/doc/daal4py/model-builders.rst b/doc/daal4py/model-builders.rst
index 3d1f9e7b26..4e89b2a849 100644
--- a/doc/daal4py/model-builders.rst
+++ b/doc/daal4py/model-builders.rst
@@ -24,17 +24,17 @@ Model Builders for the Gradient Boosting Frameworks
Introduction
------------------
-Gradient boosting on decision trees is one of the most accurate and efficient
-machine learning algorithms for classification and regression.
-The most popular implementations of it are:
+Gradient boosting on decision trees is one of the most accurate and efficient
+machine learning algorithms for classification and regression.
+The most popular implementations of it are:
* XGBoost*
* LightGBM*
* CatBoost*
daal4py Model Builders deliver the accelerated
-models inference of those frameworks. The inference is performed by the oneDAL GBT implementation tuned
-for the best performance on the Intel(R) Architecture.
+models inference of those frameworks. The inference is performed by the oneDAL GBT implementation tuned
+for the best performance on the Intel(R) Architecture.
Conversion
---------
@@ -61,22 +61,49 @@ CatBoost::
Classification and Regression Inference
----------------------------------------
-The API is the same for classification and regression inference.
-Based on the original model passed to the ``convert_model``, ``d4p_prediction`` is either the classification or regression output.
-
+The API is the same for classification and regression inference.
+Based on the original model passed to ``convert_model()``, ``d4p_prediction`` is either the classification or regression output.
+
::
-
+
d4p_prediction = d4p_model.predict(test_data)
Here, the ``predict()`` method of ``d4p_model`` is being used to make predictions on the ``test_data`` dataset.
-The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method.
+The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method.
+
+SHAP Value Calculation for Regression Models
+------------------------------------------------------------
+
+SHAP contribution and interaction value calculations are natively supported by models created with daal4py Model Builders.
+For these models, the ``predict()`` method takes additional keyword arguments:
+
+ ::
+
+ d4p_model.predict(test_data, pred_contribs=True) # for SHAP contributions
+ d4p_model.predict(test_data, pred_interactions=True) # for SHAP interactions
+
+The returned prediction has the shape:
+
+ * ``(n_rows, n_features + 1)`` for SHAP contributions
+ * ``(n_rows, n_features + 1, n_features + 1)`` for SHAP interactions
+
+Here, ``n_rows`` is the number of rows (i.e., observations) in
+``test_data``, and ``n_features`` is the number of features in the dataset.
+
+The prediction result for SHAP contributions includes a feature attribution value for each feature and a bias term for each observation.
+
+The prediction result for SHAP interactions comprises ``(n_features + 1) x (n_features + 1)`` values for all possible
+feature combinations, along with their corresponding bias terms.
+
+.. note:: The shapes of SHAP contributions and interactions are consistent with the XGBoost results.
+ In contrast, the `SHAP Python package `_ drops bias terms, resulting
+ in SHAP contributions (SHAP interactions) with one fewer column (one fewer column and row) per observation.
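As a sanity check of these shapes and the retained bias term, the per-row sum of contributions reproduces the model prediction, mirroring the XGBoost convention referenced in the note above. A sketch for a squared-error regression model, reusing the ``d4p_model``/``test_data`` names from the section above (the tolerance is an assumption):

```python
import numpy as np

contribs = d4p_model.predict(test_data, pred_contribs=True)
prediction = d4p_model.predict(test_data)

# feature attributions + bias column add up to the prediction per row
assert np.allclose(contribs.sum(axis=1), prediction, atol=1e-4)
```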
Scikit-learn-style Estimators
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can also use the scikit-learn-style classes ``GBTDAALClassifier`` and ``GBTDAALRegressor`` to convert and infer your models. For example:
-::
+::
from daal4py.sklearn.ensemble import GBTDAALRegressor
reg = xgb.XGBRegressor()
@@ -88,9 +115,9 @@ Limitations
------------------
Model Builders support only base inference with prediction and probabilities prediction. The functionality is to be extended.
Therefore, there are the following limitations:
-- The categorical features are not supported for conversion and prediction.
+- The categorical features are not supported for conversion and prediction.
- The multioutput models are not supported for conversion and prediction.
-- The tree SHAP calculations are not supported.
+- SHAP values can be calculated for regression models only.
Examples
@@ -98,6 +125,7 @@ Examples
Model Builders models conversion
- `XGBoost model conversion `_
+- `SHAP value prediction from an XGBoost model `_
- `LightGBM model conversion `_
- `CatBoost model conversion `_
diff --git a/examples/daal4py/log_reg_model_builder.py b/examples/mb/log_reg_model_builder.py
similarity index 100%
rename from examples/daal4py/log_reg_model_builder.py
rename to examples/mb/log_reg_model_builder.py
diff --git a/examples/daal4py/model_builders_catboost.py b/examples/mb/model_builders_catboost.py
similarity index 100%
rename from examples/daal4py/model_builders_catboost.py
rename to examples/mb/model_builders_catboost.py
diff --git a/examples/daal4py/model_builders_lightgbm.py b/examples/mb/model_builders_lightgbm.py
similarity index 100%
rename from examples/daal4py/model_builders_lightgbm.py
rename to examples/mb/model_builders_lightgbm.py
diff --git a/examples/daal4py/model_builders_xgboost.py b/examples/mb/model_builders_xgboost.py
similarity index 100%
rename from examples/daal4py/model_builders_xgboost.py
rename to examples/mb/model_builders_xgboost.py
diff --git a/examples/mb/model_builders_xgboost_shap.py b/examples/mb/model_builders_xgboost_shap.py
new file mode 100644
index 0000000000..7780714fd5
--- /dev/null
+++ b/examples/mb/model_builders_xgboost_shap.py
@@ -0,0 +1,80 @@
+# ==============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# daal4py Gradient Boosting Regression model creation and SHAP value
+# prediction example
+
+import numpy as np
+import xgboost as xgb
+from sklearn.datasets import make_regression
+from sklearn.model_selection import train_test_split
+
+import daal4py as d4p
+
+
+def main(*args, **kwargs):
+ # create data
+ X, y = make_regression(n_samples=10000, n_features=10, random_state=42)
+ X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)
+
+ # train the model
+ xgb_model = xgb.XGBRegressor(
+ max_depth=6, n_estimators=100, random_state=42, base_score=0.5
+ )
+ xgb_model.fit(X_train, y_train)
+
+ # Conversion to daal4py
+ daal_model = d4p.mb.convert_model(xgb_model.get_booster())
+
+ # SHAP contributions
+ daal_contribs = daal_model.predict(X_test, pred_contribs=True)
+
+ # SHAP interactions
+ daal_interactions = daal_model.predict(X_test, pred_interactions=True)
+
+ # XGBoost reference values
+ xgb_contribs = xgb_model.get_booster().predict(
+ xgb.DMatrix(X_test), pred_contribs=True, validate_features=False
+ )
+ xgb_interactions = xgb_model.get_booster().predict(
+ xgb.DMatrix(X_test), pred_interactions=True, validate_features=False
+ )
+
+ return (
+ daal_contribs,
+ daal_interactions,
+ xgb_contribs,
+ xgb_interactions,
+ )
+
+
+if __name__ == "__main__":
+ daal_contribs, daal_interactions, xgb_contribs, xgb_interactions = main()
+ print(f"XGBoost SHAP contributions shape: {xgb_contribs.shape}")
+ print(f"daal4py SHAP contributions shape: {daal_contribs.shape}")
+
+ print(f"XGBoost SHAP interactions shape: {xgb_interactions.shape}")
+ print(f"daal4py SHAP interactions shape: {daal_interactions.shape}")
+
+ contribution_rmse = np.sqrt(
+ np.mean((daal_contribs.reshape(-1, 1) - xgb_contribs.reshape(-1, 1)) ** 2)
+ )
+ print(f"SHAP contributions RMSE: {contribution_rmse:.2e}")
+
+ interaction_rmse = np.sqrt(
+ np.mean((daal_interactions.reshape(-1, 1) - xgb_interactions.reshape(-1, 1)) ** 2)
+ )
+ print(f"SHAP interactions RMSE: {interaction_rmse:.2e}")
diff --git a/generator/parse.py b/generator/parse.py
index a0fda84af3..6611c59c94 100644
--- a/generator/parse.py
+++ b/generator/parse.py
@@ -283,8 +283,14 @@ def parse(self, elem, ctxt):
ctxt.enum = False
return True
regex = (
- r"^\s*(\w+)(?:\s*=\s*((\(int\))?\w(\w|:|\s|\+)*))?"
- + r"(\s*,)?\s*((/\*|//).*)?$"
+ # capture group for value name
+ r"^\s*(\w+)"
+ # capture group for value (different possible formats, 123, 0x1, (1 << 5), etc.)
+ + r"(?:\s*=\s*((\(int\))?(\w|:|\s|\+|\(?\d+\s*<<\s*\d+\)?)*))?"
+ # comma after the value, plus possible comments
+ + r"(\s*,)?\s*((/\*|//).*)?"
+ # EOL
+ + r"$"
)
me = re.match(regex, elem)
if me and not me.group(1).startswith("last"):
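A standalone sanity check of the extended pattern (the sample enum lines are invented for illustration):

```python
import re

regex = (
    r"^\s*(\w+)"
    r"(?:\s*=\s*((\(int\))?(\w|:|\s|\+|\(?\d+\s*<<\s*\d+\)?)*))?"
    r"(\s*,)?\s*((/\*|//).*)?"
    r"$"
)

for line in ["first = 0,", "flag = (1 << 5), // bitmask", "plain,"]:
    m = re.match(regex, line)
    print(m.group(1), "->", m.group(2))
# first -> 0
# flag -> (1 << 5)
# plain -> None
```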
diff --git a/requirements-test-optional.txt b/requirements-test-optional.txt
deleted file mode 100644
index 45e2575ef4..0000000000
--- a/requirements-test-optional.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-xgboost==1.7.6; python_version <= '3.9'
-xgboost==2.0.0; python_version >= '3.10'
-lightgbm==4.1.0
-catboost==1.2.2; python_version <= '3.11'
diff --git a/requirements-test.txt b/requirements-test.txt
index 10d61ade83..fc9c0ad4eb 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,3 +6,8 @@ scikit-learn==1.2.2 ; python_version == '3.8'
scikit-learn==1.3.1 ; python_version >= '3.9'
pandas==2.0.1 ; python_version == '3.8'
pandas==2.1.1 ; python_version >= '3.9'
+xgboost==1.7.6; python_version <= '3.9'
+xgboost==2.0.0; python_version >= '3.10'
+lightgbm==4.1.0
+catboost==1.2.2; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported
+shap==0.42.1; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported
diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx
index b6ed202037..c031e983ee 100755
--- a/src/gbt_convertors.pyx
+++ b/src/gbt_convertors.pyx
@@ -1,507 +1,720 @@
-#===============================================================================
-# Copyright 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-import json
-import re
-from collections import deque
-from os import getpid, remove
-from time import time
-from typing import Any, Deque, Dict, List
-
-
-def get_lightgbm_params(booster):
- return booster.dump_model()
-
-def get_xgboost_params(booster):
- return json.loads(booster.save_config())
-
-def get_catboost_params(booster):
- dump_filename = f"catboost_model_{getpid()}_{time()}"
-
- # Dump model in file
- booster.save_model(dump_filename, 'json')
-
- # Read json with model
- with open(dump_filename) as file:
- model_data = json.load(file)
-
- # Delete dump file
- remove(dump_filename)
- return model_data
-
-def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any:
- class Node:
- def __init__(self, tree: Dict[str, Any], parent_id: int, position: int):
- self.tree = tree
- self.parent_id = parent_id
- self.position = position
-
- if lgb_model is None:
- lgb_model = get_lightgbm_params(model)
-
- n_features = lgb_model["max_feature_idx"] + 1
- n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"]
- n_classes = lgb_model["num_tree_per_iteration"]
-
- is_regression = False
- objective_fun = lgb_model["objective"]
- if n_classes > 2:
- if "multiclass" not in objective_fun:
- raise TypeError(
- "multiclass (softmax) objective is only supported for multiclass classification")
- elif "binary" in objective_fun: # nClasses == 1
- n_classes = 2
- else:
- is_regression = True
-
- if is_regression:
- mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations)
- else:
- mb = gbt_clf_model_builder(
- n_features=n_features, n_iterations=n_iterations, n_classes=n_classes)
-
- class_label = 0
- iterations_counter = 0
- for tree in lgb_model["tree_info"]:
- if is_regression:
- tree_id = mb.create_tree(tree["num_leaves"]*2-1)
- else:
- tree_id = mb.create_tree(n_nodes=tree["num_leaves"]*2-1, class_label=class_label)
-
- iterations_counter += 1
- if iterations_counter == n_iterations:
- iterations_counter = 0
- class_label += 1
- sub_tree = tree["tree_structure"]
-
- # root is leaf
- if "leaf_value" in sub_tree:
- mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"])
- continue
-
- # add root
- feat_val = sub_tree["threshold"]
- if isinstance(feat_val, str):
- raise NotImplementedError(
- "Categorical features are not supported in daal4py Gradient Boosting Trees")
- default_left = int(sub_tree["default_left"])
- parent_id = mb.add_split(
- tree_id=tree_id, feature_index=sub_tree["split_feature"],
- feature_value=feat_val, default_left=default_left)
-
- # create stack
- node_stack: List[Node] = [Node(sub_tree["left_child"], parent_id, 0),
- Node(sub_tree["right_child"], parent_id, 1)]
-
- # dfs through it
- while node_stack:
- sub_tree = node_stack[-1].tree
- parent_id = node_stack[-1].parent_id
- position = node_stack[-1].position
- node_stack.pop()
-
- # current node is leaf
- if "leaf_index" in sub_tree:
- mb.add_leaf(
- tree_id=tree_id, response=sub_tree["leaf_value"],
- parent_id=parent_id, position=position)
- continue
-
- # current node is split
- feat_val = sub_tree["threshold"]
- if isinstance(feat_val, str):
- raise NotImplementedError(
- "Categorical features are not supported in daal4py Gradient Boosting Trees")
- default_left = int(sub_tree["default_left"])
- parent_id = mb.add_split(
- tree_id=tree_id, feature_index=sub_tree["split_feature"],
- feature_value=feat_val,
- default_left=default_left,
- parent_id=parent_id, position=position)
-
- # append children
- node_stack.append(Node(sub_tree["left_child"], parent_id, 0))
- node_stack.append(Node(sub_tree["right_child"], parent_id, 1))
-
- return mb.model()
-
-
-def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
- class Node:
- def __init__(self, tree: Dict, parent_id: int, position: int):
- self.tree = tree
- self.parent_id = parent_id
- self.position = position
-
- # Release Note for XGBoost 1.5.0: Python interface now supports configuring
- # constraints using feature names instead of feature indices. This also
- # helps with pandas input with set feature names.
- lst = [*range(booster.num_features())]
- booster.feature_names = [str(i) for i in lst]
-
- trees_arr = booster.get_dump(dump_format="json")
- if xgb_config is None:
- xgb_config = get_xgboost_params(booster)
-
-
- n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"])
- n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"])
- base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"])
-
- is_regression = False
- objective_fun = xgb_config["learner"]["learner_train_param"]["objective"]
- if n_classes > 2:
- if objective_fun not in ["multi:softprob", "multi:softmax"]:
- raise TypeError(
- "multi:softprob and multi:softmax are only supported for multiclass classification")
- elif objective_fun.find("binary:") == 0:
- if objective_fun in ["binary:logistic", "binary:logitraw"]:
- n_classes = 2
- else:
- raise TypeError(
- "binary:logistic and binary:logitraw are only supported for binary classification")
- else:
- is_regression = True
-
- if hasattr(booster, "best_iteration"):
- n_iterations = booster.best_iteration + 1
- trees_arr = trees_arr[: n_iterations * (n_classes if n_classes > 2 else 1)]
- else:
- n_iterations = int(len(trees_arr) / (n_classes if n_classes > 2 else 1))
-
- # Create + base iteration
- if is_regression:
- mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1)
-
- tree_id = mb.create_tree(1)
- mb.add_leaf(tree_id=tree_id, response=base_score)
- else:
- mb = gbt_clf_model_builder(
- n_features=n_features, n_iterations=n_iterations, n_classes=n_classes)
-
- class_label = 0
- iterations_counter = 0
- mis_eq_yes = None
- for tree in trees_arr:
- n_nodes = 1
- # find out the number of nodes in the tree
- for node in tree.split("nodeid")[1:]:
- node_id = int(node[3:node.find(",")])
- if node_id + 1 > n_nodes:
- n_nodes = node_id + 1
- if is_regression:
- tree_id = mb.create_tree(n_nodes)
- else:
- tree_id = mb.create_tree(n_nodes=n_nodes, class_label=class_label)
-
- iterations_counter += 1
- if iterations_counter == n_iterations:
- iterations_counter = 0
- class_label += 1
- sub_tree = json.loads(tree)
-
- # root is leaf
- if "leaf" in sub_tree:
- mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf"])
- continue
-
- # add root
- try:
- feature_index = int(sub_tree["split"])
- except ValueError:
- raise TypeError("Feature names must be integers")
- feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf))
- default_left = int(sub_tree["yes"] == sub_tree["missing"])
- parent_id = mb.add_split(tree_id=tree_id, feature_index=feature_index,
- feature_value=feature_value, default_left=default_left)
-
- # create queue
- node_queue: Deque[Node] = deque()
- node_queue.append(Node(sub_tree["children"][0], parent_id, 0))
- node_queue.append(Node(sub_tree["children"][1], parent_id, 1))
-
- # bfs through it
- while node_queue:
- sub_tree = node_queue[0].tree
- parent_id = node_queue[0].parent_id
- position = node_queue[0].position
- node_queue.popleft()
-
- # current node is leaf
- if "leaf" in sub_tree:
- mb.add_leaf(
- tree_id=tree_id, response=sub_tree["leaf"],
- parent_id=parent_id, position=position)
- continue
-
- # current node is split
- try:
- feature_index = int(sub_tree["split"])
- except ValueError:
- raise TypeError("Feature names must be integers")
- feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf))
- default_left = int(sub_tree["yes"] == sub_tree["missing"])
-
- parent_id = mb.add_split(
- tree_id=tree_id, feature_index=feature_index, feature_value=feature_value,
- default_left=default_left, parent_id=parent_id, position=position)
-
- # append to queue
- node_queue.append(Node(sub_tree["children"][0], parent_id, 0))
- node_queue.append(Node(sub_tree["children"][1], parent_id, 1))
-
- return mb.model()
-
-def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any:
- if not model.is_fitted():
- raise RuntimeError(
- "Model should be fitted before exporting to daal4py.")
-
- if model_data is None:
- model_data = get_catboost_params(model)
-
- if 'categorical_features' in model_data['features_info']:
- raise NotImplementedError(
- "Categorical features are not supported in daal4py Gradient Boosting Trees")
-
- n_features = len(model_data['features_info']['float_features'])
-
- is_symmetric_tree = model_data['model_info']['params']['tree_learner_options']['grow_policy'] == 'SymmetricTree'
-
- if is_symmetric_tree:
- n_iterations = len(model_data['oblivious_trees'])
- else:
- n_iterations = len(model_data['trees'])
-
- n_classes = 0
-
- if 'class_params' in model_data['model_info']:
- is_classification = True
- n_classes = len(model_data['model_info']
- ['class_params']['class_to_label'])
- mb = gbt_clf_model_builder(
- n_features=n_features, n_iterations=n_iterations, n_classes=n_classes)
- else:
- is_classification = False
- mb = gbt_reg_model_builder(n_features, n_iterations)
-
- splits = []
-
- # Create splits array (all splits are placed sequentially)
- for feature in model_data['features_info']['float_features']:
- if feature['borders']:
- for feature_border in feature['borders']:
- splits.append(
- {'feature_index': feature['feature_index'], 'value': feature_border})
-
- if not is_classification:
- bias = model_data['scale_and_bias'][1][0] / n_iterations
- scale = model_data['scale_and_bias'][0]
- else:
- bias = 0
- scale = 1
-
- trees_explicit = []
- tree_symmetric = []
-
- if model_data['model_info']['params']['data_processing_options']['float_features_binarization']['nan_mode'] == 'Min':
- default_left = 1
- else:
- default_left = 0
-
- for tree_num in range(n_iterations):
- if is_symmetric_tree:
-
- if model_data['oblivious_trees'][tree_num]['splits'] is not None:
- # Tree has more than 1 node
- cur_tree_depth = len(
- model_data['oblivious_trees'][tree_num]['splits'])
- else:
- cur_tree_depth = 0
-
- tree_symmetric.append(
- (model_data['oblivious_trees'][tree_num], cur_tree_depth))
- else:
- class Node:
- def __init__(self, parent=None, split=None, value=None) -> None:
- self.right = None
- self.left = None
- self.split = split
- self.value = value
-
- n_nodes = 1
- # Check if node is a leaf (in case of stump)
- if 'split' in model_data['trees'][tree_num]:
- # Get number of trees and splits info via BFS
- # Create queue
- nodes_queue = []
- root_node = Node(
- split=splits[model_data['trees'][tree_num]['split']['split_index']])
- nodes_queue.append((model_data['trees'][tree_num], root_node))
- while nodes_queue:
- cur_node_data, cur_node = nodes_queue.pop(0)
- if 'value' in cur_node_data:
- if isinstance(cur_node_data['value'], list):
- cur_node.value = [
- value for value in cur_node_data['value']]
- else:
- cur_node.value = [
- cur_node_data['value'] * scale + bias]
- else:
- cur_node.split = splits[cur_node_data['split']
- ['split_index']]
- left_node = Node()
- right_node = Node()
- cur_node.left = left_node
- cur_node.right = right_node
- nodes_queue.append((cur_node_data['left'], left_node))
- nodes_queue.append(
- (cur_node_data['right'], right_node))
- n_nodes += 2
- else:
- root_node = Node()
- if is_classification and n_classes > 2:
- root_node.value = [
- value * scale for value in model_data['trees'][tree_num]['value']]
- else:
- root_node.value = [model_data['trees'][tree_num]['value'] * scale + bias]
- trees_explicit.append((root_node, n_nodes))
-
- tree_id = []
- class_label = 0
- count = 0
-
- # Only 1 tree for each iteration in case of regression or binary classification
- if not is_classification or n_classes == 2:
- n_tree_each_iter = 1
- else:
- n_tree_each_iter = n_classes
-
- # Create id for trees (for the right order in modelbuilder)
- for i in range(n_iterations):
- for c in range(n_tree_each_iter):
- if is_symmetric_tree:
- n_nodes = 2**(tree_symmetric[i][1] + 1) - 1
- else:
- n_nodes = trees_explicit[i][1]
-
- if is_classification and n_classes > 2:
- tree_id.append(mb.create_tree(n_nodes, class_label))
- count += 1
- if count == n_iterations:
- class_label += 1
- count = 0
-
- elif is_classification:
- tree_id.append(mb.create_tree(n_nodes, 0))
- else:
- tree_id.append(mb.create_tree(n_nodes))
-
-
- if is_symmetric_tree:
- for class_label in range(n_tree_each_iter):
- for i in range(n_iterations):
- cur_tree_info = tree_symmetric[i][0]
- cur_tree_id = tree_id[i * n_tree_each_iter + class_label]
- cur_tree_leaf_val = cur_tree_info['leaf_values']
- cur_tree_depth = tree_symmetric[i][1]
-
- if cur_tree_depth == 0:
- mb.add_leaf(
- tree_id=cur_tree_id, response=cur_tree_leaf_val[0])
- else:
- # One split used for the whole level
- cur_level_split = splits[cur_tree_info['splits']
- [cur_tree_depth - 1]['split_index']]
- root_id = mb.add_split(
- tree_id=cur_tree_id, feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'],
- default_left=default_left)
- prev_level_nodes = [root_id]
-
- # Iterate over levels, splits in json are reversed (root split is the last)
- for cur_level in range(cur_tree_depth - 2, -1, -1):
- cur_level_nodes = []
- for cur_parent in prev_level_nodes:
- cur_level_split = splits[cur_tree_info['splits']
- [cur_level]['split_index']]
- cur_left_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=0,
- feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'],
- default_left=default_left)
- cur_right_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=1,
- feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'],
- default_left=default_left)
- cur_level_nodes.append(cur_left_node)
- cur_level_nodes.append(cur_right_node)
- prev_level_nodes = cur_level_nodes
-
- # Different storing format for leaves
- if not is_classification or n_classes == 2:
- for last_level_node_num in range(len(prev_level_nodes)):
- mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num]
- * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0)
- mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num + 1]
- * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1)
- else:
- for last_level_node_num in range(len(prev_level_nodes)):
- left_index = 2 * last_level_node_num * n_tree_each_iter + class_label
- right_index = (2 * last_level_node_num + 1) * \
- n_tree_each_iter + class_label
- mb.add_leaf(
- tree_id=cur_tree_id, response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0)
- mb.add_leaf(
- tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1)
- else:
- for class_label in range(n_tree_each_iter):
- for i in range(n_iterations):
- root_node = trees_explicit[i][0]
-
- cur_tree_id = tree_id[i * n_tree_each_iter + class_label]
- # Traverse tree via BFS and build tree with modelbuilder
- if root_node.value is None:
- root_id = mb.add_split(
- tree_id=cur_tree_id, feature_index=root_node.split['feature_index'], feature_value=root_node.split['value'],
- default_left=default_left)
- nodes_queue = [(root_node, root_id)]
- while nodes_queue:
- cur_node, cur_node_id = nodes_queue.pop(0)
- left_node = cur_node.left
- # Check if node is a leaf
- if left_node.value is None:
- left_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=0,
- feature_index=left_node.split['feature_index'], feature_value=left_node.split['value'],
- default_left=default_left)
- nodes_queue.append((left_node, left_node_id))
- else:
- mb.add_leaf(
- tree_id=cur_tree_id, response=left_node.value[class_label], parent_id=cur_node_id, position=0)
- right_node = cur_node.right
- # Check if node is a leaf
- if right_node.value is None:
- right_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=1,
- feature_index=right_node.split['feature_index'], feature_value=right_node.split['value'],
- default_left=default_left)
- nodes_queue.append((right_node, right_node_id))
- else:
- mb.add_leaf(
- tree_id=cur_tree_id, response=cur_node.right.value[class_label],
- parent_id=cur_node_id, position=1)
-
- else:
- # Tree has only one node
- mb.add_leaf(tree_id=cur_tree_id,
- response=root_node.value[class_label])
-
- return mb.model()
+# ===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+
+import json
+from collections import deque
+from tempfile import NamedTemporaryFile
+from typing import Any, Deque, Dict, List, Optional, Tuple
+from warnings import warn
+
+import numpy as np
+
+
+class CatBoostNode:
+ def __init__(
+ self,
+ split: Optional[Dict] = None,
+ value: Optional[List[float]] = None,
+ right: "Optional[CatBoostNode]" = None,
+ left: "Optional[CatBoostNode]" = None,
+ cover: Optional[float] = None,
+ ) -> None:
+ self.split = split
+ self.value = value
+ self.right = right
+ self.left = left
+ self.cover = cover
+
+
+class Node:
+ """Helper class holding Tree Node information"""
+
+ def __init__(
+ self,
+ cover: float,
+ is_leaf: bool,
+ default_left: bool,
+ feature: int,
+ value: float,
+ n_children: int = 0,
+ left_child: "Optional[Node]" = None,
+ right_child: "Optional[Node]" = None,
+ parent_id: Optional[int] = -1,
+ position: Optional[int] = -1,
+ ) -> None:
+ self.cover = cover
+ self.is_leaf = is_leaf
+ self.default_left = default_left
+ self.__feature = feature
+ self.value = value
+ self.n_children = n_children
+ self.left_child = left_child
+ self.right_child = right_child
+ self.parent_id = parent_id
+ self.position = position
+
+ @staticmethod
+ def from_xgb_dict(input_dict: Dict[str, Any]) -> "Node":
+ if "children" in input_dict:
+ left_child = Node.from_xgb_dict(input_dict["children"][0])
+ right_child = Node.from_xgb_dict(input_dict["children"][1])
+ n_children = 2 + left_child.n_children + right_child.n_children
+ else:
+ left_child = None
+ right_child = None
+ n_children = 0
+ is_leaf = "leaf" in input_dict
+ default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"]
+ return Node(
+ cover=input_dict["cover"],
+ is_leaf=is_leaf,
+ default_left=default_left,
+ feature=input_dict.get("split"),
+ value=input_dict["leaf"] if is_leaf else input_dict["split_condition"],
+ n_children=n_children,
+ left_child=left_child,
+ right_child=right_child,
+ )
+
+ @staticmethod
+ def from_lightgbm_dict(input_dict: Dict[str, Any]) -> "Node":
+ if "tree_structure" in input_dict:
+ tree = input_dict["tree_structure"]
+ else:
+ tree = input_dict
+
+ n_children = 0
+ if "left_child" in tree:
+ left_child = Node.from_lightgbm_dict(tree["left_child"])
+ n_children += 1 + left_child.n_children
+ else:
+ left_child = None
+ if "right_child" in tree:
+ right_child = Node.from_lightgbm_dict(tree["right_child"])
+ n_children += 1 + right_child.n_children
+ else:
+ right_child = None
+
+ is_leaf = "leaf_value" in tree
+ return Node(
+ cover=tree["leaf_count"] if is_leaf else tree["internal_count"],
+ is_leaf=is_leaf,
+ default_left=tree.get("default_left", 0),
+ feature=tree.get("split_feature"),
+ value=tree["leaf_value"] if is_leaf else tree["threshold"],
+ n_children=n_children,
+ left_child=left_child,
+ right_child=right_child,
+ )
+
+ def get_value_closest_float_downward(self) -> np.float64:
+ """Get the closest exact fp value smaller than self.value"""
+ return np.nextafter(np.single(self.value), np.single(-np.inf))
+
+ def get_children(self) -> "Optional[Tuple[Node, Node]]":
+ if not self.left_child or not self.right_child:
+ assert self.is_leaf
+ else:
+ return (self.left_child, self.right_child)
+
+ @property
+ def feature(self) -> int:
+ if isinstance(self.__feature, int):
+ return self.__feature
+ if isinstance(self.__feature, str) and self.__feature.isnumeric():
+ return int(self.__feature)
+ raise ValueError(
+ f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})"
+ )
+
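To make the expected input concrete, here is a toy node dictionary shaped like the output of `booster.get_dump(dump_format="json", with_stats=True)` (field values invented):

```python
toy = {
    "nodeid": 0, "split": "0", "split_condition": 0.5,
    "yes": 1, "no": 2, "missing": 1, "cover": 100.0,
    "children": [
        {"nodeid": 1, "leaf": -0.4, "cover": 60.0},
        {"nodeid": 2, "leaf": 0.7, "cover": 40.0},
    ],
}
root = Node.from_xgb_dict(toy)
print(root.n_children, root.is_leaf, root.feature)    # 2 False 0
# split value is nudged down by one float32 ulp for strict-less-than semantics
print(root.get_value_closest_float_downward() < 0.5)  # True
```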
+
+class TreeView:
+ """Helper class, treating a list of nodes as one tree"""
+
+ def __init__(self, tree_id: int, root_node: Node) -> None:
+ self.tree_id = tree_id
+ self.root_node = root_node
+
+ @property
+ def is_leaf(self) -> bool:
+ return self.root_node.is_leaf
+
+ @property
+ def value(self) -> float:
+ if not self.is_leaf:
+ raise ValueError("Tree is not a leaf-only tree")
+ if self.root_node.value is None:
+ raise ValueError("Tree is leaf-only but leaf node has no value")
+ return self.root_node.value
+
+ @property
+ def cover(self) -> float:
+ if not self.is_leaf:
+ raise ValueError("Tree is not a leaf-only tree")
+ return self.root_node.cover
+
+ @property
+ def n_nodes(self) -> int:
+ return self.root_node.n_children + 1
+
+
+class TreeList(list):
+ """Helper class that is able to extract all information required by the
+ model builders from various objects"""
+
+ @staticmethod
+ def from_xgb_booster(booster, max_trees: int) -> "TreeList":
+ """
+ Load a TreeList from an xgb.Booster object
+ Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code,
+ therefore no type hint is added.
+ """
+ tl = TreeList()
+ dump = booster.get_dump(dump_format="json", with_stats=True)
+ for tree_id, raw_tree in enumerate(dump):
+ if max_trees > 0 and tree_id == max_trees:
+ break
+ raw_tree_parsed = json.loads(raw_tree)
+ root_node = Node.from_xgb_dict(raw_tree_parsed)
+ tl.append(TreeView(tree_id=tree_id, root_node=root_node))
+
+ return tl
+
+ @staticmethod
+ def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList":
+ """
+ Load a TreeList from a LightGBM Booster dump
+ Note: We cannot type-hint the Model without loading lightgbm as dependency in pyx code,
+ therefore no type hint is added.
+ """
+ tl = TreeList()
+ for tree_id, tree_dict in enumerate(dump["tree_info"]):
+ root_node = Node.from_lightgbm_dict(tree_dict)
+ tl.append(TreeView(tree_id=tree_id, root_node=root_node))
+
+ return tl
+
+ def __setitem__(self, key, value):
+ raise NotImplementedError(
+ "Use TreeList.from_*() methods to initialize a TreeList"
+ )
+
+
+def get_lightgbm_params(booster):
+ return booster.dump_model()
+
+
+def get_xgboost_params(booster):
+ return json.loads(booster.save_config())
+
+
+def get_catboost_params(booster):
+ with NamedTemporaryFile() as fp:
+ booster.save_model(fp.name, "json")
+ fp.seek(0)
+ model_data = json.load(fp)
+ return model_data
+
+
+def get_gbt_model_from_tree_list(
+ tree_list: TreeList,
+ n_iterations: int,
+ is_regression: bool,
+ n_features: int,
+ n_classes: int,
+ base_score: Optional[float] = None,
+):
+ """Return a GBT Model from TreeList"""
+
+ if is_regression:
+ mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations)
+ else:
+ mb = gbt_clf_model_builder(
+ n_features=n_features, n_iterations=n_iterations, n_classes=n_classes
+ )
+
+ class_label = 0
+ for counter, tree in enumerate(tree_list, start=1):
+ # create the tree; the TreeView already knows its number of nodes
+ if is_regression:
+ tree_id = mb.create_tree(tree.n_nodes)
+ else:
+ tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label)
+
+ if counter % n_iterations == 0:
+ class_label += 1
+
+ if tree.is_leaf:
+ mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover)
+ continue
+
+ root_node = tree.root_node
+ parent_id = mb.add_split(
+ tree_id=tree_id,
+ feature_index=root_node.feature,
+ feature_value=root_node.get_value_closest_float_downward(),
+ cover=root_node.cover,
+ default_left=root_node.default_left,
+ )
+
+ # create queue
+ node_queue: Deque[Node] = deque()
+ children = root_node.get_children()
+ assert children is not None
+ for position, child in enumerate(children):
+ child.parent_id = parent_id
+ child.position = position
+ node_queue.append(child)
+
+ while node_queue:
+ node = node_queue.popleft()
+ assert node.parent_id != -1, "node.parent_id must not be -1"
+ assert node.position != -1, "node.position must not be -1"
+
+ if node.is_leaf:
+ mb.add_leaf(
+ tree_id=tree_id,
+ response=node.value,
+ cover=node.cover,
+ parent_id=node.parent_id,
+ position=node.position,
+ )
+ else:
+ parent_id = mb.add_split(
+ tree_id=tree_id,
+ feature_index=node.feature,
+ feature_value=node.get_value_closest_float_downward(),
+ cover=node.cover,
+ default_left=node.default_left,
+ parent_id=node.parent_id,
+ position=node.position,
+ )
+
+ children = node.get_children()
+ assert children is not None
+ for position, child in enumerate(children):
+ child.parent_id = parent_id
+ child.position = position
+ node_queue.append(child)
+
+ return mb.model(base_score=base_score)
+
+
+def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any:
+ if booster is None:
+ booster = model.dump_model()
+
+ n_features = booster["max_feature_idx"] + 1
+ n_iterations = len(booster["tree_info"]) // booster["num_tree_per_iteration"]
+ n_classes = booster["num_tree_per_iteration"]
+
+ is_regression = False
+ objective_fun = booster["objective"]
+ if n_classes > 2:
+ if "multiclass" not in objective_fun:
+ raise TypeError(
+ "multiclass (softmax) objective is only supported for multiclass classification"
+ )
+ elif "binary" in objective_fun: # nClasses == 1
+ n_classes = 2
+ else:
+ is_regression = True
+
+ tree_list = TreeList.from_lightgbm_booster_dump(booster)
+
+ return get_gbt_model_from_tree_list(
+ tree_list,
+ n_iterations=n_iterations,
+ is_regression=is_regression,
+ n_features=n_features,
+ n_classes=n_classes,
+ )
+
+
+def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
+ # Release Note for XGBoost 1.5.0: Python interface now supports configuring
+ # constraints using feature names instead of feature indices. This also
+ # helps with pandas input with set feature names.
+ booster.feature_names = [str(i) for i in range(booster.num_features())]
+
+ if xgb_config is None:
+ xgb_config = get_xgboost_params(booster)
+
+ n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"])
+ n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"])
+ base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"])
+
+ is_regression = False
+ objective_fun = xgb_config["learner"]["learner_train_param"]["objective"]
+ if n_classes > 2:
+ if objective_fun not in ["multi:softprob", "multi:softmax"]:
+ raise TypeError(
+ "multi:softprob and multi:softmax are only supported for multiclass classification"
+ )
+ elif objective_fun.startswith("binary:"):
+ if objective_fun not in ["binary:logistic", "binary:logitraw"]:
+ raise TypeError(
+ "only binary:logistic and binary:logitraw are supported for binary classification"
+ )
+ n_classes = 2
+ if objective_fun == "binary:logitraw":
+ # daal4py always applies a sigmoid for pred_proba, whereas XGBoost
+ # returns raw predictions with logitraw
+ warn(
+ "objective='binary:logitraw' selected\n"
+ "XGBoost returns raw class scores when calling pred_proba()\n"
+ "whilst scikit-learn-intelex always uses binary:logistic\n"
+ )
+ if base_score != 0.5:
+ warn("objective='binary:logitraw' ignores base_score, fixing base_score to 0.5")
+ base_score = 0.5
+ else:
+ is_regression = True
+
+ # max_trees=0 if best_iteration does not exist
+ max_trees = getattr(booster, "best_iteration", -1) + 1
+ if n_classes > 2:
+ max_trees *= n_classes
+ tree_list = TreeList.from_xgb_booster(booster, max_trees)
+
+ if hasattr(booster, "best_iteration"):
+ n_iterations = booster.best_iteration + 1
+ else:
+ n_iterations = len(tree_list) // (n_classes if n_classes > 2 else 1)
+
+ return get_gbt_model_from_tree_list(
+ tree_list,
+ n_iterations=n_iterations,
+ is_regression=is_regression,
+ n_features=n_features,
+ n_classes=n_classes,
+ base_score=base_score,
+ )
+
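An aside on the `best_iteration` handling above: boosters trained with early stopping are truncated at conversion time, so only the trees up to the best iteration enter the daal4py model. A rough sketch (assumed setup; `early_stopping_rounds` as a constructor argument requires xgboost >= 1.6):

```python
import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

import daal4py as d4p

X, y = make_regression(n_samples=2000, n_features=8, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=0)

reg = xgb.XGBRegressor(n_estimators=500, early_stopping_rounds=5)
reg.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])

# only the first best_iteration + 1 trees are converted
model = d4p.mb.convert_model(reg.get_booster())
```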
+
+def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any:
+ if not model.is_fitted():
+ raise RuntimeError("Model should be fitted before exporting to daal4py.")
+
+ if model_data is None:
+ model_data = get_catboost_params(model)
+
+ if "categorical_features" in model_data["features_info"]:
+ raise NotImplementedError(
+ "Categorical features are not supported in daal4py Gradient Boosting Trees"
+ )
+
+ n_features = len(model_data["features_info"]["float_features"])
+
+ is_symmetric_tree = (
+ model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"]
+ == "SymmetricTree"
+ )
+
+ if is_symmetric_tree:
+ n_iterations = len(model_data["oblivious_trees"])
+ else:
+ n_iterations = len(model_data["trees"])
+
+ n_classes = 0
+
+ if "class_params" in model_data["model_info"]:
+ is_classification = True
+ n_classes = len(model_data["model_info"]["class_params"]["class_to_label"])
+ mb = gbt_clf_model_builder(
+ n_features=n_features, n_iterations=n_iterations, n_classes=n_classes
+ )
+ else:
+ is_classification = False
+ mb = gbt_reg_model_builder(n_features, n_iterations)
+
+ splits = []
+
+ # Create splits array (all splits are placed sequentially)
+ for feature in model_data["features_info"]["float_features"]:
+ if feature["borders"]:
+ for feature_border in feature["borders"]:
+ splits.append(
+ {"feature_index": feature["feature_index"], "value": feature_border}
+ )
+
+ if not is_classification:
+ bias = model_data["scale_and_bias"][1][0] / n_iterations
+ scale = model_data["scale_and_bias"][0]
+ else:
+ bias = 0
+ scale = 1
+
+ trees_explicit = []
+ tree_symmetric = []
+
+ if (
+ model_data["model_info"]["params"]["data_processing_options"][
+ "float_features_binarization"
+ ]["nan_mode"]
+ == "Min"
+ ):
+ default_left = 1
+ else:
+ default_left = 0
+
+ for tree_num in range(n_iterations):
+ if is_symmetric_tree:
+ if model_data["oblivious_trees"][tree_num]["splits"] is not None:
+ # Tree has more than 1 node
+ cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"])
+ else:
+ cur_tree_depth = 0
+
+ tree_symmetric.append(
+ (model_data["oblivious_trees"][tree_num], cur_tree_depth)
+ )
+ else:
+ n_nodes = 1
+ # Check if node is a leaf (in case of stump)
+ if "split" in model_data["trees"][tree_num]:
+ # Get number of trees and splits info via BFS
+ # Create queue
+ nodes_queue = []
+ root_node = CatBoostNode(
+ split=splits[model_data["trees"][tree_num]["split"]["split_index"]]
+ )
+ nodes_queue.append((model_data["trees"][tree_num], root_node))
+ while nodes_queue:
+ cur_node_data, cur_node = nodes_queue.pop(0)
+ if "value" in cur_node_data:
+ if isinstance(cur_node_data["value"], list):
+ cur_node.value = [value for value in cur_node_data["value"]]
+ else:
+ cur_node.value = [cur_node_data["value"] * scale + bias]
+ else:
+ cur_node.split = splits[cur_node_data["split"]["split_index"]]
+ left_node = CatBoostNode()
+ right_node = CatBoostNode()
+ cur_node.left = left_node
+ cur_node.right = right_node
+ nodes_queue.append((cur_node_data["left"], left_node))
+ nodes_queue.append((cur_node_data["right"], right_node))
+ n_nodes += 2
+ else:
+ root_node = CatBoostNode()
+ if is_classification and n_classes > 2:
+ root_node.value = [
+ value * scale for value in model_data["trees"][tree_num]["value"]
+ ]
+ else:
+ root_node.value = [
+ model_data["trees"][tree_num]["value"] * scale + bias
+ ]
+ trees_explicit.append((root_node, n_nodes))
+
+ tree_id = []
+ class_label = 0
+ count = 0
+
+ # Only 1 tree for each iteration in case of regression or binary classification
+ if not is_classification or n_classes == 2:
+ n_tree_each_iter = 1
+ else:
+ n_tree_each_iter = n_classes
+
+ # Create id for trees (for the right order in modelbuilder)
+ for i in range(n_iterations):
+ for c in range(n_tree_each_iter):
+ if is_symmetric_tree:
+ n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1
+ else:
+ n_nodes = trees_explicit[i][1]
+
+ if is_classification and n_classes > 2:
+ tree_id.append(mb.create_tree(n_nodes, class_label))
+ count += 1
+ if count == n_iterations:
+ class_label += 1
+ count = 0
+
+ elif is_classification:
+ tree_id.append(mb.create_tree(n_nodes, 0))
+ else:
+ tree_id.append(mb.create_tree(n_nodes))
+
+ if is_symmetric_tree:
+ for class_label in range(n_tree_each_iter):
+ for i in range(n_iterations):
+ cur_tree_info = tree_symmetric[i][0]
+ cur_tree_id = tree_id[i * n_tree_each_iter + class_label]
+ cur_tree_leaf_val = cur_tree_info["leaf_values"]
+ cur_tree_depth = tree_symmetric[i][1]
+
+ if cur_tree_depth == 0:
+ mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0])
+ else:
+ # One split used for the whole level
+ cur_level_split = splits[
+ cur_tree_info["splits"][cur_tree_depth - 1]["split_index"]
+ ]
+ root_id = mb.add_split(
+ tree_id=cur_tree_id,
+ feature_index=cur_level_split["feature_index"],
+ feature_value=cur_level_split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ prev_level_nodes = [root_id]
+
+ # Iterate over levels, splits in json are reversed (root split is the last)
+ for cur_level in range(cur_tree_depth - 2, -1, -1):
+ cur_level_nodes = []
+ for cur_parent in prev_level_nodes:
+ cur_level_split = splits[
+ cur_tree_info["splits"][cur_level]["split_index"]
+ ]
+ cur_left_node = mb.add_split(
+ tree_id=cur_tree_id,
+ parent_id=cur_parent,
+ position=0,
+ feature_index=cur_level_split["feature_index"],
+ feature_value=cur_level_split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ cur_right_node = mb.add_split(
+ tree_id=cur_tree_id,
+ parent_id=cur_parent,
+ position=1,
+ feature_index=cur_level_split["feature_index"],
+ feature_value=cur_level_split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ cur_level_nodes.append(cur_left_node)
+ cur_level_nodes.append(cur_right_node)
+ prev_level_nodes = cur_level_nodes
+
+ # Different storing format for leaves
+ if not is_classification or n_classes == 2:
+ for last_level_node_num in range(len(prev_level_nodes)):
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=cur_tree_leaf_val[2 * last_level_node_num]
+ * scale
+ + bias,
+ parent_id=prev_level_nodes[last_level_node_num],
+ position=0,
+ cover=0.0,
+ )
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=cur_tree_leaf_val[2 * last_level_node_num + 1]
+ * scale
+ + bias,
+ parent_id=prev_level_nodes[last_level_node_num],
+ position=1,
+ cover=0.0,
+ )
+ else:
+ for last_level_node_num in range(len(prev_level_nodes)):
+ left_index = (
+ 2 * last_level_node_num * n_tree_each_iter + class_label
+ )
+ right_index = (
+ 2 * last_level_node_num + 1
+ ) * n_tree_each_iter + class_label
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=cur_tree_leaf_val[left_index] * scale + bias,
+ parent_id=prev_level_nodes[last_level_node_num],
+ position=0,
+ cover=0.0,
+ )
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=cur_tree_leaf_val[right_index] * scale + bias,
+ parent_id=prev_level_nodes[last_level_node_num],
+ position=1,
+ cover=0.0,
+ )
+ else:
+ for class_label in range(n_tree_each_iter):
+ for i in range(n_iterations):
+ root_node = trees_explicit[i][0]
+
+ cur_tree_id = tree_id[i * n_tree_each_iter + class_label]
+ # Traverse tree via BFS and build tree with modelbuilder
+ if root_node.value is None:
+ root_id = mb.add_split(
+ tree_id=cur_tree_id,
+ feature_index=root_node.split["feature_index"],
+ feature_value=root_node.split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ nodes_queue = [(root_node, root_id)]
+ while nodes_queue:
+ cur_node, cur_node_id = nodes_queue.pop(0)
+ left_node = cur_node.left
+ # Check if node is a leaf
+ if left_node.value is None:
+ left_node_id = mb.add_split(
+ tree_id=cur_tree_id,
+ parent_id=cur_node_id,
+ position=0,
+ feature_index=left_node.split["feature_index"],
+ feature_value=left_node.split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ nodes_queue.append((left_node, left_node_id))
+ else:
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=left_node.value[class_label],
+ parent_id=cur_node_id,
+ position=0,
+ cover=0.0,
+ )
+ right_node = cur_node.right
+ # Check if node is a leaf
+ if right_node.value is None:
+ right_node_id = mb.add_split(
+ tree_id=cur_tree_id,
+ parent_id=cur_node_id,
+ position=1,
+ feature_index=right_node.split["feature_index"],
+ feature_value=right_node.split["value"],
+ default_left=default_left,
+ cover=0.0,
+ )
+ nodes_queue.append((right_node, right_node_id))
+ else:
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=cur_node.right.value[class_label],
+ parent_id=cur_node_id,
+ position=1,
+ cover=0.0,
+ )
+
+ else:
+ # Tree has only one node
+ mb.add_leaf(
+ tree_id=cur_tree_id,
+ response=root_node.value[class_label],
+ cover=0.0,
+ )
+
+ warn("Models converted from CatBoost cannot be used for SHAP value calculation")
+ return mb.model(0.0)
diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h
index c906a0a537..7a99b07b8c 100644
--- a/src/gbt_model_builder.h
+++ b/src/gbt_model_builder.h
@@ -22,10 +22,16 @@
#include
#include "onedal/version.hpp"
-#if (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023))
-#define _gbt_inference_has_missing_values_support 1
+#if (((MAJOR_VERSION == 2024) && (MINOR_VERSION == 0) && (UPDATE_VERSION >= 1)) || ((MAJOR_VERSION == 2024) && (MINOR_VERSION >= 1)) || (MAJOR_VERSION > 2024))
+ // added missing value support to GBT regression
+ // added SHAP value support
+ // added base_score parameter
+ #define _gbt_inference_api_version 2
+#elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023))
+ // added missing value support to GBT classification
+ #define _gbt_inference_api_version 1
#else
-#define _gbt_inference_has_missing_values_support 0
+ #define _gbt_inference_api_version 0
#endif
typedef daal::algorithms::gbt::classification::ModelBuilder c_gbt_classification_model_builder;
@@ -39,32 +45,61 @@ typedef c_gbt_regression_model_builder::TreeId c_gbt_reg_tree_id;
#define c_gbt_clf_no_parent c_gbt_classification_model_builder::noParent
#define c_gbt_reg_no_parent c_gbt_regression_model_builder::noParent
-static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_)
+static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_, double base_score)
{
- return RAW()(obj_->getModel());
+ daal::algorithms::gbt::classification::ModelPtr * ptr = RAW()(obj_->getModel());
+#if (_gbt_inference_api_version == 2)
+ ptr->get()->setPredictionBias(base_score);
+#endif
+ return ptr;
}
-
-static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_)
+static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_, double base_score)
{
- return RAW()(obj_->getModel());
+ daal::algorithms::gbt::regression::ModelPtr * ptr = RAW()(obj_->getModel());
+#if (_gbt_inference_api_version == 2)
+ ptr->get()->setPredictionBias(base_score);
+#endif
+ return ptr;
}
-c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft)
+c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)
{
-#if _gbt_inference_has_missing_values_support
+#if (_gbt_inference_api_version == 2)
+ return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft, cover);
+#elif (_gbt_inference_api_version == 1)
return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft);
#else
return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue);
#endif
}
-c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft)
+c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)
{
-#if _gbt_inference_has_missing_values_support
+#if (_gbt_inference_api_version == 2)
+ return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft, cover);
+#elif (_gbt_inference_api_version == 1)
return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft);
#else
return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue);
#endif
}
+c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover)
+{
+#if (_gbt_inference_api_version == 2)
+ return c_ptr->addLeafNode(treeId, parentId, position, response, cover);
+#else
+ return c_ptr->addLeafNode(treeId, parentId, position, response);
+#endif
+}
+
+c_gbt_reg_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover)
+{
+#if (_gbt_inference_api_version == 2)
+ return c_ptr->addLeafNode(treeId, parentId, position, response, cover);
+#else
+ return c_ptr->addLeafNode(treeId, parentId, position, response);
+#endif
+}
+
#endif // _GBT_MODEL_BUILDER_INCLUDED_
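
The compile-time gate above has a runtime counterpart on the Python side: callers can probe the installed oneDAL before requesting SHAP output. A short sketch using the same daal_check_version threshold the new tests rely on:

    from daal4py.sklearn._utils import daal_check_version

    if daal_check_version((2024, "P", 1)):
        # inference API v2: cover and base_score are honored, so
        # pred_contribs / pred_interactions are available
        shap_supported = True
    else:
        # older runtimes accept cover and base_score through the Python API,
        # but the wrappers above silently drop them
        shap_supported = False
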
diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx
index f46264ed94..418390a4ec 100644
--- a/src/gbt_model_builder.pyx
+++ b/src/gbt_model_builder.pyx
@@ -27,21 +27,24 @@ cdef extern from "gbt_model_builder.h":
cdef size_t c_gbt_clf_no_parent
cdef size_t c_gbt_reg_no_parent
- cdef gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *)
- cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *)
+ cdef gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *, double base_score)
+ cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *, double base_score)
cdef cppclass c_gbt_classification_model_builder:
c_gbt_classification_model_builder(size_t nFeatures, size_t nIterations, size_t nClasses) except +
c_gbt_clf_tree_id createTree(size_t nNodes, size_t classLabel)
- c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response)
+ c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover)
cdef cppclass c_gbt_regression_model_builder:
c_gbt_regression_model_builder(size_t nFeatures, size_t nIterations) except +
c_gbt_reg_tree_id createTree(size_t nNodes)
- c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response)
+ c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover)
- cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft)
- cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft)
+ cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)
+ cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)
+
+ cdef c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover)
+ cdef c_gbt_reg_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover)
cdef class gbt_classification_model_builder:
'''
@@ -65,7 +68,7 @@ cdef class gbt_classification_model_builder:
'''
return self.c_ptr.createTree(n_nodes, class_label)
- def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
+ def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
'''
Create Leaf node and add it to certain tree
@@ -73,11 +76,12 @@ cdef class gbt_classification_model_builder:
:param node-handle parent_id: parent node to which new node is added (use noParent for root node)
:param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
:param double response: response value for leaf node to be predicted
+ :param double cover: cover (sum_hess) of the leaf node
:rtype: node identifier
'''
- return self.c_ptr.addLeafNode(tree_id, parent_id, position, response)
+ return clfAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover)
- def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
+ def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
'''
Create Split node and add it to certain tree.
@@ -87,18 +91,20 @@ cdef class gbt_classification_model_builder:
:param size_t feature_index: feature index for spliting
:param double feature_value: feature value for spliting
:param int default_left: default behaviour in case of missing value
+ :param double cover: cover (sum_hess) of the split node
:rtype: node identifier
'''
- return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left)
+ return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover)
- def model(self):
+ def model(self, base_score):
'''
Get built model
+ :param double base_score: global prediction bias (used e.g. in XGBoost)
:rtype: gbt_classification_model
'''
cdef gbt_classification_model res = gbt_classification_model.__new__(gbt_classification_model)
- res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr)
+ res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr, base_score or 0.0)
return res
@@ -123,7 +129,7 @@ cdef class gbt_regression_model_builder:
'''
return self.c_ptr.createTree(n_nodes)
- def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
+ def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
'''
Create Leaf node and add it to certain tree
@@ -131,11 +137,12 @@ cdef class gbt_regression_model_builder:
:param node-handle parent_id: parent node to which new node is added (use noParent for root node)
:param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
:param double response: response value for leaf node to be predicted
+ :param double cover: cover (sum_hess) of the leaf node
:rtype: node identifier
'''
- return self.c_ptr.addLeafNode(tree_id, parent_id, position, response)
+ return regAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover)
- def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
+ def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
'''
Create Split node and add it to certain tree.
@@ -144,19 +151,21 @@ cdef class gbt_regression_model_builder:
:param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
:param size_t feature_index: feature index for spliting
:param double feature_value: feature value for spliting
+ :param double cover: cover (sum_hess) of the split node
:param int default_left: default behaviour in case of missing value
:rtype: node identifier
'''
- return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left)
+ return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover)
- def model(self):
+ def model(self, base_score):
'''
Get built model
+ :param double base_score: global prediction bias (used e.g. in XGBoost)
:rtype: gbt_regression_model
'''
cdef gbt_regression_model res = gbt_regression_model.__new__(gbt_regression_model)
- res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr)
+ res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr, base_score or 0.0)
return res
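
Taken together, the new signatures let callers attach per-node cover statistics and a global base_score. A hedged sketch of building a one-split regression tree (it assumes the builder class is exposed at the daal4py top level under the name above and that its constructor mirrors the C++ nFeatures/nIterations arguments):

    import daal4py as d4p

    mb = d4p.gbt_regression_model_builder(2, 1)  # assumed: (n_features, n_iterations)
    tree = mb.create_tree(n_nodes=3)
    root = mb.add_split(tree_id=tree, feature_index=0, feature_value=0.5,
                        default_left=1, cover=10.0)  # cover = sum_hess of the split
    mb.add_leaf(tree_id=tree, response=-1.0, cover=6.0, parent_id=root, position=0)
    mb.add_leaf(tree_id=tree, response=1.0, cover=4.0, parent_id=root, position=1)
    model = mb.model(base_score=0.5)  # stored as the model's prediction bias
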
diff --git a/tests/test_examples.py b/tests/test_examples.py
index a9e8adaf23..c509416148 100755
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -19,9 +19,11 @@
test_path = os.path.abspath(os.path.dirname(__file__))
unittest_data_path = os.path.join(test_path, "unittest_data")
-examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py")
-sys.path.insert(0, examples_path)
-os.chdir(examples_path)
+daal4py_examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py")
+mb_examples_path = os.path.join(os.path.dirname(test_path), "examples", "mb")
+sys.path.insert(0, daal4py_examples_path)
+sys.path.insert(0, mb_examples_path)
+os.chdir(daal4py_examples_path)
import unittest
@@ -270,6 +272,13 @@ def test_svm(self):
((2020, "P", 2), (2021, "B", 109)),
["xgboost"],
),
+ (
+ "model_builders_xgboost_shap",
+ None,
+ None,
+ (2024, "P", 1),
+ ["xgboost"],
+ ),
("model_builders_catboost", None, None, (2021, "P", 4), ["catboost"]),
("gradient_boosted_classification",),
("gradient_boosted_regression",),
diff --git a/tests/test_logistic_regression_model_builder.py b/tests/test_logistic_regression_model_builder.py
deleted file mode 100644
index 3a28677743..0000000000
--- a/tests/test_logistic_regression_model_builder.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# ==============================================================================
-# Copyright 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-import unittest
-
-import numpy as np
-from sklearn.datasets import load_breast_cancer, load_iris
-from sklearn.linear_model import LogisticRegression
-
-import daal4py as d4p
-from daal4py import _get__daal_link_version__ as dv
-from daal4py.sklearn._utils import daal_check_version
-
-# First item is major version - 2021,
-# second is minor+patch - 0110,
-# third item is status - B
-daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8]))
-reason = str(((2021, "P", 1))) + " not supported in this library version "
-reason += str(daal_version)
-
-
-class LogRegModelBuilder(unittest.TestCase):
- @unittest.skipUnless(
- all(
- [
- hasattr(d4p, "logistic_regression_model_builder"),
- daal_check_version(((2021, "P", 1))),
- ]
- ),
- reason,
- )
- def test_iris_with_intercept(self):
- X, y = load_iris(return_X_y=True)
- n_classes = 3
- clf = LogisticRegression(fit_intercept=True, max_iter=1000, random_state=0).fit(
- X, y
- )
- builder = d4p.logistic_regression_model_builder(
- n_classes=n_classes, n_features=X.shape[1]
- )
- builder.set_beta(clf.coef_, clf.intercept_)
-
- alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
-
- pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
- pred_sklearn = clf.predict(X)
- self.assertTrue(np.allclose(pred_daal, pred_sklearn))
-
- @unittest.skipUnless(
- all(
- [
- hasattr(d4p, "logistic_regression_model_builder"),
- daal_check_version(((2021, "P", 1))),
- ]
- ),
- reason,
- )
- def test_iris_without_intercept(self):
- X, y = load_iris(return_X_y=True)
- n_classes = 3
- clf = LogisticRegression(fit_intercept=False, max_iter=1000, random_state=0).fit(
- X, y
- )
- builder = d4p.logistic_regression_model_builder(
- n_classes=n_classes, n_features=X.shape[1]
- )
- builder.set_beta(clf.coef_, clf.intercept_)
-
- alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
-
- pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
- pred_sklearn = clf.predict(X)
- self.assertTrue(np.allclose(pred_daal, pred_sklearn))
-
- @unittest.skipUnless(
- all(
- [
- hasattr(d4p, "logistic_regression_model_builder"),
- daal_check_version(((2021, "P", 1))),
- ]
- ),
- reason,
- )
- def test_breast_cancer_with_intercept(self):
- X, y = load_breast_cancer(return_X_y=True)
- n_classes = 2
- clf = LogisticRegression(fit_intercept=True, max_iter=10000, random_state=0).fit(
- X, y
- )
- builder = d4p.logistic_regression_model_builder(
- n_classes=n_classes, n_features=X.shape[1]
- )
- builder.set_beta(clf.coef_, clf.intercept_)
-
- alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
-
- pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
- pred_sklearn = clf.predict(X)
- self.assertTrue(np.allclose(pred_daal, pred_sklearn))
-
- @unittest.skipUnless(
- all(
- [
- hasattr(d4p, "logistic_regression_model_builder"),
- daal_check_version(((2021, "P", 1))),
- ]
- ),
- reason,
- )
- def test_breast_cancer_without_intercept(self):
- X, y = load_breast_cancer(return_X_y=True)
- n_classes = 2
- clf = LogisticRegression(fit_intercept=False, max_iter=10000, random_state=0).fit(
- X, y
- )
- builder = d4p.logistic_regression_model_builder(
- n_classes=n_classes, n_features=X.shape[1]
- )
- builder.set_beta(clf.coef_, clf.intercept_)
-
- alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
-
- pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
- pred_sklearn = clf.predict(X)
- self.assertTrue(np.allclose(pred_daal, pred_sklearn))
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
new file mode 100644
index 0000000000..93b39fd77e
--- /dev/null
+++ b/tests/test_model_builders.py
@@ -0,0 +1,780 @@
+# ==============================================================================
+# Copyright 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import unittest
+
+import lightgbm as lgbm
+import numpy as np
+import xgboost as xgb
+from sklearn.datasets import (
+ load_breast_cancer,
+ load_iris,
+ make_classification,
+ make_regression,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+import daal4py as d4p
+from daal4py.sklearn._utils import daal_check_version
+
+try:
+ import catboost as cb
+
+ cb_available = True
+except ImportError:
+ cb_available = False
+
+try:
+ import shap
+
+ shap_available = True
+except ImportError:
+ shap_available = False
+
+
+shap_required_version = (2024, "P", 1)
+shap_supported = daal_check_version(shap_required_version)
+shap_not_supported_str = (
+ f"SHAP value calculation only supported for version {shap_required_version} or later"
+)
+shap_unavailable_str = "SHAP Python package not available"
+cb_unavailable_str = "CatBoost not available"
+
+
+class LogRegModelBuilder(unittest.TestCase):
+ def test_iris_with_intercept(self):
+ X, y = load_iris(return_X_y=True)
+ n_classes = 3
+ clf = LogisticRegression(fit_intercept=True, max_iter=1000, random_state=0).fit(
+ X, y
+ )
+ builder = d4p.logistic_regression_model_builder(
+ n_classes=n_classes, n_features=X.shape[1]
+ )
+ builder.set_beta(clf.coef_, clf.intercept_)
+
+ alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
+
+ pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
+ pred_sklearn = clf.predict(X)
+ self.assertTrue(np.allclose(pred_daal, pred_sklearn))
+
+ def test_iris_without_intercept(self):
+ X, y = load_iris(return_X_y=True)
+ n_classes = 3
+ clf = LogisticRegression(fit_intercept=False, max_iter=1000, random_state=0).fit(
+ X, y
+ )
+ builder = d4p.logistic_regression_model_builder(
+ n_classes=n_classes, n_features=X.shape[1]
+ )
+ builder.set_beta(clf.coef_, clf.intercept_)
+
+ alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
+
+ pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
+ pred_sklearn = clf.predict(X)
+ self.assertTrue(np.allclose(pred_daal, pred_sklearn))
+
+ def test_breast_cancer_with_intercept(self):
+ X, y = load_breast_cancer(return_X_y=True)
+ n_classes = 2
+ clf = LogisticRegression(fit_intercept=True, max_iter=10000, random_state=0).fit(
+ X, y
+ )
+ builder = d4p.logistic_regression_model_builder(
+ n_classes=n_classes, n_features=X.shape[1]
+ )
+ builder.set_beta(clf.coef_, clf.intercept_)
+
+ alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
+
+ pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
+ pred_sklearn = clf.predict(X)
+ self.assertTrue(np.allclose(pred_daal, pred_sklearn))
+
+ def test_breast_cancer_without_intercept(self):
+ X, y = load_breast_cancer(return_X_y=True)
+ n_classes = 2
+ clf = LogisticRegression(fit_intercept=False, max_iter=10000, random_state=0).fit(
+ X, y
+ )
+ builder = d4p.logistic_regression_model_builder(
+ n_classes=n_classes, n_features=X.shape[1]
+ )
+ builder.set_beta(clf.coef_, clf.intercept_)
+
+ alg_pred = d4p.logistic_regression_prediction(nClasses=n_classes)
+
+ pred_daal = alg_pred.compute(X, builder.model).prediction.flatten()
+ pred_sklearn = clf.predict(X)
+ self.assertTrue(np.allclose(pred_daal, pred_sklearn))
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostRegressionModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls, base_score=0.5):
+ X, y = make_regression(n_samples=100, n_features=10, random_state=42)
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ cls.xgb_model = xgb.XGBRegressor(
+ max_depth=5, n_estimators=50, random_state=42, base_score=base_score
+ )
+ cls.xgb_model.fit(X, y)
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ # XGBoost treats regression as 0 classes, LightGBM 1 class
+ # For us, it does not make a difference and both are acceptable
+ self.assertEqual(m.n_classes_, 0)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertTrue(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict(self.X_test)
+ xgboost_pred = self.xgb_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict(self.X_nan)
+ xgboost_pred = self.xgb_model.predict(self.X_nan)
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
+
+ def test_model_predict_shap_contribs(self):
+ booster = self.xgb_model.get_booster()
+ m = d4p.mb.convert_model(booster)
+ d4p_pred = m.predict(self.X_test, pred_contribs=True)
+ xgboost_pred = booster.predict(
+ xgb.DMatrix(self.X_test),
+ pred_contribs=True,
+ approx_contribs=False,
+ validate_features=False,
+ )
+ self.assertTrue(
+ d4p_pred.shape == xgboost_pred.shape,
+ f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {xgboost_pred.shape}",
+ )
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
+
+ def test_model_predict_shap_interactions(self):
+ booster = self.xgb_model.get_booster()
+ m = d4p.mb.convert_model(booster)
+ d4p_pred = m.predict(self.X_test, pred_interactions=True)
+ xgboost_pred = booster.predict(
+ xgb.DMatrix(self.X_test),
+ pred_interactions=True,
+ approx_contribs=False,
+ validate_features=False,
+ )
+ self.assertTrue(
+ d4p_pred.shape == xgboost_pred.shape,
+ f"d4p and reference SHAP interaction shape is different {d4p_pred.shape} != {xgboost_pred.shape}",
+ )
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
+
+ def test_model_predict_shap_contribs_missing_values(self):
+ booster = self.xgb_model.get_booster()
+ m = d4p.mb.convert_model(booster)
+ d4p_pred = m.predict(self.X_nan, pred_contribs=True)
+ xgboost_pred = booster.predict(
+ xgb.DMatrix(self.X_nan),
+ pred_contribs=True,
+ approx_contribs=False,
+ validate_features=False,
+ )
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=5e-6)
+
+
+# duplicate all tests for base_score=0.0
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostRegressionModelBuilder.setUpClass(0)
+
+
+# duplicate all tests for base_score=100
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostRegressionModelBuilder.setUpClass(100)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"):
+ n_features = 15
+ cls.base_score = base_score
+ cls.n_classes = n_classes
+ X, y = make_classification(
+ n_samples=500,
+ n_classes=n_classes,
+ n_features=n_features,
+ n_informative=10,
+ random_state=42,
+ )
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 2 * n_features, dtype=np.float32).reshape(
+ 2, n_features
+ )
+ cls.xgb_model = xgb.XGBClassifier(
+ max_depth=5,
+ n_estimators=50,
+ random_state=42,
+ base_score=base_score,
+ objective=objective,
+ )
+ cls.xgb_model.fit(X, y)
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ self.assertEqual(m.n_classes_, self.n_classes)
+ self.assertEqual(m.n_features_in_, 15)
+ self.assertFalse(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict(self.X_test)
+ xgboost_pred = self.xgb_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7)
+
+ def test_model_predict_proba(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict_proba(self.X_test)
+ xgboost_pred = self.xgb_model.predict_proba(self.X_test)
+ # calculating probas involves multiple exp / ln operations, therefore
+ # they're quite susceptible to small numerical changes and we have to
+ # accept an rtol of 1e-5
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict(self.X_nan)
+ xgboost_pred = self.xgb_model.predict(self.X_nan)
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7)
+
+ def test_model_predict_shap_contribs(self):
+ booster = self.xgb_model.get_booster()
+ m = d4p.mb.convert_model(booster)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_contribs=True)
+
+ def test_model_predict_shap_interactions(self):
+ booster = self.xgb_model.get_booster()
+ m = d4p.mb.convert_model(booster)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_interactions=True)
+
+
+# duplicate all tests for base_score=0.3
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostClassificationModelBuilder.setUpClass(base_score=0.3)
+
+
+# duplicate all tests for base_score=0.7
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostClassificationModelBuilder.setUpClass(base_score=0.7)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostClassificationModelBuilder.setUpClass(n_classes=5)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder_n_classes5_base_score03(
+ XGBoostClassificationModelBuilder
+):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostClassificationModelBuilder_objective_logitraw(
+ XGBoostClassificationModelBuilder
+):
+ @classmethod
+ def setUpClass(cls):
+ XGBoostClassificationModelBuilder.setUpClass(
+ base_score=0.5, n_classes=2, objective="binary:logitraw"
+ )
+
+ def test_model_predict_proba(self):
+ # override this test because daal4py always applies the sigmoid;
+ # with bias 0.5, we can still check that the original scores are correct
+ with self.assertWarns(UserWarning):
+ # expect a warning that logitraw behaves differently and/or
+ # that base_score is ignored / fixed to 0.5
+ m = d4p.mb.convert_model(self.xgb_model.get_booster())
+ d4p_pred = m.predict_proba(self.X_test)
+ # undo sigmoid
+ d4p_pred = np.log(-d4p_pred / (d4p_pred - 1))
+ # undo bias
+ d4p_pred += 0.5
+ xgboost_pred = self.xgb_model.predict_proba(self.X_test)
+ # calculating probas involves multiple exp / ln operations, therefore
+ # they're quite susceptible to small numerical changes and we have to
+ # accept an rtol of 1e-5
+ np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5)
+
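+# Note on the inversion above: predict_proba applies a sigmoid, so for
+# p = sigmoid(s) the raw score is recovered as s = log(p / (1 - p)),
+# which is the log(-p / (p - 1)) form used in the test.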
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class LightGBMRegressionModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ X, y = make_regression(n_samples=100, n_features=10, random_state=42)
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ # LightGBM requires a couple of NaN values in the training data to properly set
+ # the missing value type to NaN
+ # https://github.com/microsoft/LightGBM/issues/6139
+ X_train = np.concatenate([cls.X_nan, X])
+ y_train = np.concatenate([[0, 0], y])
+ params = {
+ "task": "train",
+ "boosting": "gbdt",
+ "objective": "regression",
+ "num_leaves": 4,
+ "learning_rage": 0.05,
+ "metric": {"l2", "l1"},
+ "verbose": -1,
+ "n_estimators": 1,
+ }
+ cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train))
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ self.assertEqual(m.n_classes_, 1)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertTrue(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_test)
+ lgbm_pred = self.lgbm_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_nan)
+ lgbm_pred = self.lgbm_model.predict(self.X_nan)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6)
+
+ @unittest.skipUnless(shap_available, reason=shap_unavailable_str)
+ def test_model_predict_shap_contribs(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_test, pred_contribs=True)
+ explainer = shap.TreeExplainer(self.lgbm_model)
+ shap_pred = explainer(self.X_test).values
+ lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True)
+ self.assertTrue(
+ d4p_pred.shape == lgbm_pred.shape,
+ f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}",
+ )
+ np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)
+
+ @unittest.skipUnless(shap_available, reason=shap_unavailable_str)
+ def test_model_predict_shap_interactions(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column
+ d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1]
+ explainer = shap.TreeExplainer(self.lgbm_model)
+ shap_pred = explainer.shap_interaction_values(self.X_test)
+ self.assertTrue(
+ d4p_pred.shape == shap_pred.shape,
+ f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}",
+ )
+ np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6)
+
+ def test_model_predict_shap_contribs_missing_values(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_nan, pred_contribs=True)
+ lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True)
+ self.assertTrue(
+ d4p_pred.shape == lgbm_pred.shape,
+ f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}",
+ )
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)
+
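+# Naming note: LightGBM's Booster.predict takes pred_contrib (singular), while
+# the daal4py wrapper follows XGBoost's pred_contribs / pred_interactions
+# spelling, as the SHAP tests in this file exercise.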
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class LightGBMClassificationModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ X, y = make_classification(
+ random_state=3, n_classes=3, n_informative=3, n_features=10
+ )
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ X_train = np.concatenate([cls.X_nan, X])
+ y_train = np.concatenate([[0, 0], y])
+ params = {
+ "n_estimators": 10,
+ "task": "train",
+ "boosting": "gbdt",
+ "objective": "multiclass",
+ "num_leaves": 4,
+ "num_class": 3,
+ "verbose": -1,
+ }
+ cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train))
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ self.assertEqual(m.n_classes_, 3)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertFalse(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_test)
+ lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_model_predict_proba(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict_proba(self.X_test)
+ lgbm_pred = self.lgbm_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_nan)
+ lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_model_predict_shap_contribs(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_contribs=True)
+
+ def test_model_predict_shap_interactions(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_interactions=True)
+
+ def test_model_predict_shap_contribs_missing_values(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_nan, pred_contribs=True)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ X, y = make_classification(
+ random_state=3, n_classes=2, n_informative=3, n_features=10
+ )
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ X_train = np.concatenate([cls.X_nan, X])
+ y_train = np.concatenate([[0, 0], y])
+ params = {
+ "n_estimators": 10,
+ "task": "train",
+ "boosting": "gbdt",
+ "objective": "binary",
+ "metric": "binary_logloss",
+ "num_leaves": 4,
+ "verbose": -1,
+ }
+ cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train))
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ self.assertEqual(m.n_classes_, 2)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertFalse(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_test)
+ lgbm_pred = np.round(self.lgbm_model.predict(self.X_test)).astype(int)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_model_predict_proba(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ # predict proba of being class 1
+ d4p_pred = m.predict_proba(self.X_test)[:, 1]
+ lgbm_pred = self.lgbm_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ d4p_pred = m.predict(self.X_nan)
+ lgbm_pred = np.round(self.lgbm_model.predict(self.X_nan)).astype(int)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_model_predict_proba_missing_values(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ # predict proba of being class 1
+ d4p_pred = m.predict_proba(self.X_nan)[:, 1]
+ lgbm_pred = self.lgbm_model.predict(self.X_nan)
+ np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)
+
+ def test_model_predict_shap_contribs(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_contribs=True)
+
+ def test_model_predict_shap_interactions(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_test, pred_interactions=True)
+
+ def test_model_predict_shap_contribs_missing_values(self):
+ m = d4p.mb.convert_model(self.lgbm_model)
+ with self.assertRaises(NotImplementedError):
+ m.predict(self.X_nan, pred_contribs=True)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(cb_available, reason=cb_unavailable_str)
+class CatBoostRegressionModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ X, y = make_regression(n_samples=100, n_features=10, random_state=42)
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ params = {
+ "reg_lambda": 1,
+ "max_depth": 3,
+ "num_leaves": 2**3,
+ "verbose": 0,
+ "objective": "RMSE",
+ "learning_rate": 0.3,
+ "n_estimators": 25,
+ }
+ cls.cb_model = cb.CatBoost(params)
+ cls.cb_model.fit(X, y, verbose=0)
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ self.assertTrue(hasattr(m, "daal_model_"))
+ self.assertIsInstance(m.daal_model_, d4p._daal4py.gbt_regression_model)
+ self.assertEqual(m.daal_model_.NumberOfFeatures, 10)
+ self.assertEqual(m.daal_model_.NumberOfTrees, 25)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertTrue(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ d4p_pred = m.predict(self.X_test)
+ cb_pred = self.cb_model.predict(self.X_test)
+ np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ d4p_pred = m.predict(self.X_nan)
+ cb_pred = self.cb_model.predict(self.X_nan)
+ np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7)
+
+ def test_model_predict_shap_contribs(self):
+ # SHAP value support for CatBoost models is yet to be added
+ with self.assertWarnsRegex(
+ Warning,
+ "Models converted from CatBoost cannot be used for SHAP value calculation",
+ ):
+ d4p.mb.convert_model(self.cb_model)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(cb_available, reason=cb_unavailable_str)
+class CatBoostClassificationModelBuilder(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ X, y = make_classification(
+ n_classes=3, n_features=10, n_informative=3, random_state=42
+ )
+ cls.X_test = X[:2, :]
+ cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+ params = {
+ "reg_lambda": 1,
+ "max_depth": 3,
+ "num_leaves": 2**3,
+ "verbose": 0,
+ "objective": "MultiClass",
+ "learning_rate": 0.3,
+ "n_estimators": 25,
+ }
+ cls.cb_model = cb.CatBoost(params)
+ cls.cb_model.fit(X, y, verbose=0)
+
+ def test_model_conversion(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ self.assertTrue(hasattr(m, "daal_model_"))
+ self.assertIsInstance(m.daal_model_, d4p._daal4py.gbt_classification_model)
+ self.assertEqual(m.daal_model_.NumberOfFeatures, 10)
+ self.assertEqual(m.daal_model_.NumberOfTrees, 3 * 25)
+ self.assertEqual(m.n_features_in_, 10)
+ self.assertFalse(m._is_regression)
+
+ def test_model_predict(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ d4p_pred = m.predict(self.X_test)
+ cb_pred = self.cb_model.predict(self.X_test, prediction_type="Class").T[0]
+ np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7)
+
+ def test_missing_value_support(self):
+ m = d4p.mb.convert_model(self.cb_model)
+ d4p_pred = m.predict(self.X_nan)
+ cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0]
+ np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7)
+
+ def test_model_predict_shap_contribs(self):
+ # SHAP value support for CatBoost models is yet to be added
+ with self.assertWarnsRegex(
+ Warning,
+ "Models converted from CatBoost cannot be used for SHAP value calculation",
+ ):
+ d4p.mb.convert_model(self.cb_model)
+
+
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+class XGBoostEarlyStopping(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls) -> None:
+ num_classes = 3
+ X, y = make_classification(
+ n_samples=1500,
+ n_features=10,
+ n_informative=3,
+ n_classes=num_classes,
+ random_state=42,
+ )
+ X_train, cls.X_test, y_train, cls.y_test = train_test_split(
+ X, y, test_size=0.5, random_state=42
+ )
+
+ # training parameters
+ params = {
+ "n_estimators": 100,
+ "max_bin": 256,
+ "scale_pos_weight": 2,
+ "lambda_l2": 1,
+ "alpha": 0.9,
+ "max_depth": 8,
+ "num_leaves": 2**8,
+ "verbosity": 0,
+ "objective": "multi:softproba",
+ "learning_rate": 0.3,
+ "num_class": num_classes,
+ "early_stopping_rounds": 5,
+ "verbose_eval": False,
+ }
+
+ cls.xgb_clf = xgb.XGBClassifier(**params)
+ cls.xgb_clf.fit(
+ X_train, y_train, eval_set=[(cls.X_test, cls.y_test)], verbose=False
+ )
+ cls.daal_model = d4p.mb.convert_model(cls.xgb_clf.get_booster())
+
+ def test_early_stopping(self):
+ xgb_prediction = self.xgb_clf.predict(self.X_test)
+ xgb_proba = self.xgb_clf.predict_proba(self.X_test)
+ xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(self.y_test))
+
+ daal_prediction = self.daal_model.predict(self.X_test)
+ daal_proba = self.daal_model.predict_proba(self.X_test)
+ daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(self.y_test))
+
+ self.assertEqual(xgb_errors_count, daal_errors_count)
+
+ np.testing.assert_allclose(xgb_proba, daal_proba, rtol=1e-6)
+
+
+class ModelBuilderTreeView(unittest.TestCase):
+ def test_model_from_booster(self):
+ class MockBooster:
+ def get_dump(self, *_, **kwargs):
+ # raw dump of 2 trees with a max depth of 1
+ return [
+ ' { "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, "yes": 1, "no": 2, "missing": 1 , "gain": 3, "cover": 4, "children": [\n { "nodeid": 1, "leaf": 5 , "cover": 6 }, \n { "nodeid": 2, "leaf": 7 , "cover":8 }\n ]}',
+ ' { "nodeid": 0, "leaf": 0.2 , "cover": 42 }',
+ ]
+
+ mock = MockBooster()
+ result = d4p.TreeList.from_xgb_booster(mock, max_trees=0)
+ self.assertEqual(len(result), 2)
+
+ tree0 = result[0]
+ self.assertIsInstance(tree0, d4p.TreeView)
+ self.assertFalse(tree0.is_leaf)
+ with self.assertRaises(ValueError):
+ tree0.cover
+ with self.assertRaises(ValueError):
+ tree0.value
+
+ self.assertIsInstance(tree0.root_node, d4p.Node)
+
+ self.assertEqual(tree0.root_node.cover, 4)
+ self.assertEqual(tree0.root_node.left_child.cover, 6)
+ self.assertEqual(tree0.root_node.right_child.cover, 8)
+
+ self.assertFalse(tree0.root_node.is_leaf)
+ self.assertTrue(tree0.root_node.left_child.is_leaf)
+ self.assertTrue(tree0.root_node.right_child.is_leaf)
+
+ self.assertTrue(tree0.root_node.default_left)
+ self.assertFalse(tree0.root_node.left_child.default_left)
+ self.assertFalse(tree0.root_node.right_child.default_left)
+
+ self.assertEqual(tree0.root_node.feature, 1)
+ with self.assertRaises(ValueError):
+ tree0.root_node.left_child.feature
+ with self.assertRaises(ValueError):
+ tree0.root_node.right_child.feature
+
+ self.assertEqual(tree0.root_node.value, 2)
+ self.assertEqual(tree0.root_node.left_child.value, 5)
+ self.assertEqual(tree0.root_node.right_child.value, 7)
+
+ self.assertEqual(tree0.root_node.n_children, 2)
+ self.assertEqual(tree0.root_node.left_child.n_children, 0)
+ self.assertEqual(tree0.root_node.right_child.n_children, 0)
+
+ self.assertIsNone(tree0.root_node.left_child.left_child)
+ self.assertIsNone(tree0.root_node.left_child.right_child)
+ self.assertIsNone(tree0.root_node.right_child.left_child)
+ self.assertIsNone(tree0.root_node.right_child.right_child)
+
+ tree1 = result[1]
+ self.assertIsInstance(tree1, d4p.TreeView)
+ self.assertTrue(tree1.is_leaf)
+ self.assertEqual(tree1.n_nodes, 1)
+ self.assertEqual(tree1.cover, 42)
+ self.assertEqual(tree1.value, 0.2)
+
+
+if __name__ == "__main__":
+ unittest.main()
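
The suite above doubles as usage documentation; the core flow it exercises is, in miniature:

    import xgboost as xgb
    import daal4py as d4p
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=100, n_features=10, random_state=42)
    booster = xgb.XGBRegressor(n_estimators=50, random_state=42).fit(X, y).get_booster()

    m = d4p.mb.convert_model(booster)
    pred = m.predict(X[:2, :])                           # plain predictions
    contribs = m.predict(X[:2, :], pred_contribs=True)   # SHAP values + bias column
    inter = m.predict(X[:2, :], pred_interactions=True)  # SHAP interaction values
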
diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py
deleted file mode 100644
index 60ab1b9bdd..0000000000
--- a/tests/test_xgboost_mb.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# ==============================================================================
-# Copyright 2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-import importlib.util
-import unittest
-
-import numpy as np
-from sklearn.datasets import make_classification
-from sklearn.model_selection import train_test_split
-
-import daal4py as d4p
-from daal4py import _get__daal_link_version__ as dv
-from daal4py.sklearn._utils import daal_check_version
-
-# First item is major version - 2021,
-# second is minor+patch - 0110,
-# third item is status - B
-daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8]))
-reason = str(((2021, "P", 1))) + " not supported in this library version "
-reason += str(daal_version)
-
-
-class XgboostModelBuilder(unittest.TestCase):
- @unittest.skipUnless(
- all(
- [
- hasattr(d4p, "get_gbt_model_from_xgboost"),
- hasattr(d4p, "gbt_classification_prediction"),
- daal_check_version(((2021, "P", 1))),
- ]
- ),
- reason,
- )
- @unittest.skipUnless(
- importlib.util.find_spec("xgboost") is not None,
- "xgboost library is not installed",
- )
- def test_earlystop(self):
- import xgboost as xgb
-
- num_classes = 3
- X, y = make_classification(
- n_samples=1000,
- n_features=10,
- n_informative=3,
- n_classes=num_classes,
- random_state=42,
- )
- X_train, X_test, y_train, y_test = train_test_split(
- X, y, test_size=0.3, random_state=42
- )
-
- # training parameters setting
- params = {
- "n_estimators": 100,
- "max_bin": 256,
- "scale_pos_weight": 2,
- "lambda_l2": 1,
- "alpha": 0.9,
- "max_depth": 8,
- "num_leaves": 2**8,
- "verbosity": 0,
- "objective": "multi:softproba",
- "learning_rate": 0.3,
- "num_class": num_classes,
- "early_stopping_rounds": 5,
- }
-
- xgb_clf = xgb.XGBClassifier(**params)
- xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
- booster = xgb_clf.get_booster()
-
- xgb_prediction = xgb_clf.predict(X_test)
- xgb_proba = xgb_clf.predict_proba(X_test)
- xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test))
-
- daal_model = d4p.mb.convert_model(booster)
-
- daal_prediction = daal_model.predict(X_test)
- daal_proba = daal_model.predict_proba(X_test)
- daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test))
-
- self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0)
- self.assertTrue(np.allclose(xgb_proba, daal_proba))
-
-
-if __name__ == "__main__":
- unittest.main()