From e991daea20cd2c6afe38a373d620c9b2df9d2346 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 27 Jul 2023 03:04:29 -0700 Subject: [PATCH 01/64] rename gbt_convertors.pyx -> *.py --- generator/gen_daal4py.py | 2 +- src/gbt_convertors.py | 502 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 503 insertions(+), 1 deletion(-) create mode 100755 src/gbt_convertors.py diff --git a/generator/gen_daal4py.py b/generator/gen_daal4py.py index c808aa25c8..defc159d41 100755 --- a/generator/gen_daal4py.py +++ b/generator/gen_daal4py.py @@ -1235,7 +1235,7 @@ def gen_daal4py(dalroot, outdir, version, warn_all=False, no_dist=False, no_stre ): with open(jp("src", "gbt_model_builder.pyx"), "r") as f: pyx_gbt_model_builder = f.read() - with open(jp("src", "gbt_convertors.pyx"), "r") as f: + with open(jp('src', 'gbt_convertors.py'), 'r') as f: pyx_gbt_generators = f.read() if ( "algorithms::logistic_regression" in iface.namespace_dict diff --git a/src/gbt_convertors.py b/src/gbt_convertors.py new file mode 100755 index 0000000000..de388f1c42 --- /dev/null +++ b/src/gbt_convertors.py @@ -0,0 +1,502 @@ +#=============================================================================== +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +from typing import List, Deque, Dict, Any +from collections import deque +from os import remove, getpid +import json +import re +from time import time + +def get_lightgbm_params(booster): + return booster.dump_model() + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + +def get_catboost_params(booster): + dump_filename = f"catboost_model_{getpid()}_{time()}" + + # Dump model in file + booster.save_model(dump_filename, 'json') + + # Read json with model + with open(dump_filename) as file: + model_data = json.load(file) + + # Delete dump file + remove(dump_filename) + return model_data + +def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any: + class Node: + def __init__(self, tree: Dict[str, Any], parent_id: int, position: int): + self.tree = tree + self.parent_id = parent_id + self.position = position + + if lgb_model is None: + lgb_model = get_lightgbm_params(model) + + n_features = lgb_model["max_feature_idx"] + 1 + n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] + n_classes = lgb_model["num_tree_per_iteration"] + + is_regression = False + objective_fun = lgb_model["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass classification") + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + + class_label = 0 + iterations_counter = 0 + for tree in lgb_model["tree_info"]: + if is_regression: + tree_id = mb.create_tree(tree["num_leaves"]*2-1) + else: + tree_id = mb.create_tree(n_nodes=tree["num_leaves"]*2-1, class_label=class_label) + + iterations_counter += 1 + if iterations_counter == n_iterations: + iterations_counter = 0 + class_label += 1 + sub_tree = tree["tree_structure"] + + # root is leaf + if "leaf_value" in sub_tree: + mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) + continue + + # add root + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees") + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, feature_index=sub_tree["split_feature"], + feature_value=feat_val, default_left=default_left) + + # create stack + node_stack: List[Node] = [Node(sub_tree["left_child"], parent_id, 0), + Node(sub_tree["right_child"], parent_id, 1)] + + # dfs through it + while node_stack: + sub_tree = node_stack[-1].tree + parent_id = node_stack[-1].parent_id + position = node_stack[-1].position + node_stack.pop() + + # current node is leaf + if "leaf_index" in sub_tree: + mb.add_leaf( + tree_id=tree_id, response=sub_tree["leaf_value"], + parent_id=parent_id, position=position) + continue + + # current node is split + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees") + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, feature_index=sub_tree["split_feature"], + feature_value=feat_val, + default_left=default_left, + parent_id=parent_id, position=position) + + # 
append children + node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) + node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) + + return mb.model() + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + class Node: + def __init__(self, tree: Dict, parent_id: int, position: int): + self.tree = tree + self.parent_id = parent_id + self.position = position + + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. + lst = [*range(booster.num_features())] + booster.feature_names = [str(i) for i in lst] + + if xgb_config is None: + xgb_config = get_xgboost_params(booster) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification") + elif objective_fun.find("binary:") == 0: + if objective_fun in ["binary:logistic", "binary:logitraw"]: + n_classes = 2 + else: + raise TypeError( + "binary:logistic and binary:logitraw are only supported for binary classification") + else: + is_regression = True + + n_iterations = booster.best_iteration + 1 + booster_dump = booster.get_dump(dump_format="json") + trees_arr = booster_dump[: n_iterations * (n_classes if n_classes > 2 else 1)] + + # Create + base iteration + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) + + tree_id = mb.create_tree(1) + mb.add_leaf(tree_id=tree_id, response=base_score) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + + class_label = 0 + iterations_counter = 0 + mis_eq_yes = None + for tree in trees_arr: + n_nodes = 1 + # find out the number of nodes in the tree + for node in tree.split("nodeid")[1:]: + node_id = int(node[3:node.find(",")]) + if node_id + 1 > n_nodes: + n_nodes = node_id + 1 + if is_regression: + tree_id = mb.create_tree(n_nodes) + else: + tree_id = mb.create_tree(n_nodes=n_nodes, class_label=class_label) + + iterations_counter += 1 + if iterations_counter == n_iterations: + iterations_counter = 0 + class_label += 1 + sub_tree = json.loads(tree) + + # root is leaf + if "leaf" in sub_tree: + mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf"]) + continue + + # add root + try: + feature_index = int(sub_tree["split"]) + except ValueError: + raise TypeError("Feature names must be integers") + feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) + default_left = int(sub_tree["yes"] == sub_tree["missing"]) + parent_id = mb.add_split(tree_id=tree_id, feature_index=feature_index, + feature_value=feature_value, default_left=default_left) + + # create queue + node_queue: Deque[Node] = deque() + node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) + node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) + + # bfs through it + while node_queue: + sub_tree = node_queue[0].tree + parent_id = node_queue[0].parent_id + position = node_queue[0].position + node_queue.popleft() + + # current node is leaf + if "leaf" in 
sub_tree: + mb.add_leaf( + tree_id=tree_id, response=sub_tree["leaf"], + parent_id=parent_id, position=position) + continue + + # current node is split + try: + feature_index = int(sub_tree["split"]) + except ValueError: + raise TypeError("Feature names must be integers") + feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) + default_left = int(sub_tree["yes"] == sub_tree["missing"]) + + parent_id = mb.add_split( + tree_id=tree_id, feature_index=feature_index, feature_value=feature_value, + default_left=default_left, parent_id=parent_id, position=position) + + # append to queue + node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) + node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) + + return mb.model() + +def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + if not model.is_fitted(): + raise RuntimeError( + "Model should be fitted before exporting to daal4py.") + + if model_data is None: + model_data = get_catboost_params(model) + + if 'categorical_features' in model_data['features_info']: + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees") + + n_features = len(model_data['features_info']['float_features']) + + is_symmetric_tree = model_data['model_info']['params']['tree_learner_options']['grow_policy'] == 'SymmetricTree' + + if is_symmetric_tree: + n_iterations = len(model_data['oblivious_trees']) + else: + n_iterations = len(model_data['trees']) + + n_classes = 0 + + if 'class_params' in model_data['model_info']: + is_classification = True + n_classes = len(model_data['model_info'] + ['class_params']['class_to_label']) + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + else: + is_classification = False + mb = gbt_reg_model_builder(n_features, n_iterations) + + splits = [] + + # Create splits array (all splits are placed sequentially) + for feature in model_data['features_info']['float_features']: + if feature['borders']: + for feature_border in feature['borders']: + splits.append( + {'feature_index': feature['feature_index'], 'value': feature_border}) + + if not is_classification: + bias = model_data['scale_and_bias'][1][0] / n_iterations + scale = model_data['scale_and_bias'][0] + else: + bias = 0 + scale = 1 + + trees_explicit = [] + tree_symmetric = [] + + if model_data['model_info']['params']['data_processing_options']['float_features_binarization']['nan_mode'] == 'Min': + default_left = 1 + else: + default_left = 0 + + for tree_num in range(n_iterations): + if is_symmetric_tree: + + if model_data['oblivious_trees'][tree_num]['splits'] is not None: + # Tree has more than 1 node + cur_tree_depth = len( + model_data['oblivious_trees'][tree_num]['splits']) + else: + cur_tree_depth = 0 + + tree_symmetric.append( + (model_data['oblivious_trees'][tree_num], cur_tree_depth)) + else: + class Node: + def __init__(self, parent=None, split=None, value=None) -> None: + self.right = None + self.left = None + self.split = split + self.value = value + + n_nodes = 1 + # Check if node is a leaf (in case of stump) + if 'split' in model_data['trees'][tree_num]: + # Get number of trees and splits info via BFS + # Create queue + nodes_queue = [] + root_node = Node( + split=splits[model_data['trees'][tree_num]['split']['split_index']]) + nodes_queue.append((model_data['trees'][tree_num], root_node)) + while nodes_queue: + cur_node_data, cur_node = nodes_queue.pop(0) + if 'value' in cur_node_data: + if 
isinstance(cur_node_data['value'], list): + cur_node.value = [ + value for value in cur_node_data['value']] + else: + cur_node.value = [ + cur_node_data['value'] * scale + bias] + else: + cur_node.split = splits[cur_node_data['split'] + ['split_index']] + left_node = Node() + right_node = Node() + cur_node.left = left_node + cur_node.right = right_node + nodes_queue.append((cur_node_data['left'], left_node)) + nodes_queue.append( + (cur_node_data['right'], right_node)) + n_nodes += 2 + else: + root_node = Node() + if is_classification and n_classes > 2: + root_node.value = [ + value * scale for value in model_data['trees'][tree_num]['value']] + else: + root_node.value = [model_data['trees'][tree_num]['value'] * scale + bias] + trees_explicit.append((root_node, n_nodes)) + + tree_id = [] + class_label = 0 + count = 0 + + # Only 1 tree for each iteration in case of regression or binary classification + if not is_classification or n_classes == 2: + n_tree_each_iter = 1 + else: + n_tree_each_iter = n_classes + + # Create id for trees (for the right order in modelbuilder) + for i in range(n_iterations): + for c in range(n_tree_each_iter): + if is_symmetric_tree: + n_nodes = 2**(tree_symmetric[i][1] + 1) - 1 + else: + n_nodes = trees_explicit[i][1] + + if is_classification and n_classes > 2: + tree_id.append(mb.create_tree(n_nodes, class_label)) + count += 1 + if count == n_iterations: + class_label += 1 + count = 0 + + elif is_classification: + tree_id.append(mb.create_tree(n_nodes, 0)) + else: + tree_id.append(mb.create_tree(n_nodes)) + + + if is_symmetric_tree: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + cur_tree_info = tree_symmetric[i][0] + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + cur_tree_leaf_val = cur_tree_info['leaf_values'] + cur_tree_depth = tree_symmetric[i][1] + + if cur_tree_depth == 0: + mb.add_leaf( + tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + else: + # One split used for the whole level + cur_level_split = splits[cur_tree_info['splits'] + [cur_tree_depth - 1]['split_index']] + root_id = mb.add_split( + tree_id=cur_tree_id, feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], + default_left=default_left) + prev_level_nodes = [root_id] + + # Iterate over levels, splits in json are reversed (root split is the last) + for cur_level in range(cur_tree_depth - 2, -1, -1): + cur_level_nodes = [] + for cur_parent in prev_level_nodes: + cur_level_split = splits[cur_tree_info['splits'] + [cur_level]['split_index']] + cur_left_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=0, + feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], + default_left=default_left) + cur_right_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=1, + feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], + default_left=default_left) + cur_level_nodes.append(cur_left_node) + cur_level_nodes.append(cur_right_node) + prev_level_nodes = cur_level_nodes + + # Different storing format for leaves + if not is_classification or n_classes == 2: + for last_level_node_num in range(len(prev_level_nodes)): + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + bias, 
parent_id=prev_level_nodes[last_level_node_num], position=1) + else: + for last_level_node_num in range(len(prev_level_nodes)): + left_index = 2 * last_level_node_num * n_tree_each_iter + class_label + right_index = (2 * last_level_node_num + 1) * \ + n_tree_each_iter + class_label + mb.add_leaf( + tree_id=cur_tree_id, response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) + mb.add_leaf( + tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) + else: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + root_node = trees_explicit[i][0] + + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + # Traverse tree via BFS and build tree with modelbuilder + if root_node.value is None: + root_id = mb.add_split( + tree_id=cur_tree_id, feature_index=root_node.split['feature_index'], feature_value=root_node.split['value'], + default_left=default_left) + nodes_queue = [(root_node, root_id)] + while nodes_queue: + cur_node, cur_node_id = nodes_queue.pop(0) + left_node = cur_node.left + # Check if node is a leaf + if left_node.value is None: + left_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=0, + feature_index=left_node.split['feature_index'], feature_value=left_node.split['value'], + default_left=default_left) + nodes_queue.append((left_node, left_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, response=left_node.value[class_label], parent_id=cur_node_id, position=0) + right_node = cur_node.right + # Check if node is a leaf + if right_node.value is None: + right_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=1, + feature_index=right_node.split['feature_index'], feature_value=right_node.split['value'], + default_left=default_left) + nodes_queue.append((right_node, right_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, response=cur_node.right.value[class_label], + parent_id=cur_node_id, position=1) + + else: + # Tree has only one node + mb.add_leaf(tree_id=cur_tree_id, + response=root_node.value[class_label]) + + return mb.model() From 128aff4cefb6c745dfac074e598953eec468ac53 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 27 Jul 2023 03:05:04 -0700 Subject: [PATCH 02/64] use dataclasses for Node --- src/gbt_convertors.py | 32 +-- src/gbt_convertors.pyx | 507 ----------------------------------------- 2 files changed, 17 insertions(+), 522 deletions(-) delete mode 100755 src/gbt_convertors.pyx diff --git a/src/gbt_convertors.py b/src/gbt_convertors.py index de388f1c42..297bfde1dc 100755 --- a/src/gbt_convertors.py +++ b/src/gbt_convertors.py @@ -14,13 +14,15 @@ # limitations under the License. 
#=============================================================================== -from typing import List, Deque, Dict, Any +from typing import List, Deque, Dict, Any, Optional from collections import deque from os import remove, getpid import json import re from time import time +from attr import dataclass + def get_lightgbm_params(booster): return booster.dump_model() @@ -42,11 +44,11 @@ def get_catboost_params(booster): return model_data def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any: + @dataclass class Node: - def __init__(self, tree: Dict[str, Any], parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position + tree: Dict[str, Any] + parent_id: int + position: int if lgb_model is None: lgb_model = get_lightgbm_params(model) @@ -139,11 +141,12 @@ def __init__(self, tree: Dict[str, Any], parent_id: int, position: int): def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + @dataclass class Node: - def __init__(self, tree: Dict, parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position + tree: Dict[str, Any] + parent_id: int + position: int + cover: float # Release Note for XGBoost 1.5.0: Python interface now supports configuring # constraints using feature names instead of feature indices. This also @@ -189,7 +192,6 @@ def __init__(self, tree: Dict, parent_id: int, position: int): class_label = 0 iterations_counter = 0 - mis_eq_yes = None for tree in trees_arr: n_nodes = 1 # find out the number of nodes in the tree @@ -330,12 +332,12 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: tree_symmetric.append( (model_data['oblivious_trees'][tree_num], cur_tree_depth)) else: + @dataclass class Node: - def __init__(self, parent=None, split=None, value=None) -> None: - self.right = None - self.left = None - self.split = split - self.value = value + split: Optional[float] = None + value: Optional[list[float]] = None + right: Optional[int] = None + left: Optional[int] = None n_nodes = 1 # Check if node is a leaf (in case of stump) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx deleted file mode 100755 index b6ed202037..0000000000 --- a/src/gbt_convertors.pyx +++ /dev/null @@ -1,507 +0,0 @@ -#=============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#=============================================================================== - -import json -import re -from collections import deque -from os import getpid, remove -from time import time -from typing import Any, Deque, Dict, List - - -def get_lightgbm_params(booster): - return booster.dump_model() - -def get_xgboost_params(booster): - return json.loads(booster.save_config()) - -def get_catboost_params(booster): - dump_filename = f"catboost_model_{getpid()}_{time()}" - - # Dump model in file - booster.save_model(dump_filename, 'json') - - # Read json with model - with open(dump_filename) as file: - model_data = json.load(file) - - # Delete dump file - remove(dump_filename) - return model_data - -def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any: - class Node: - def __init__(self, tree: Dict[str, Any], parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position - - if lgb_model is None: - lgb_model = get_lightgbm_params(model) - - n_features = lgb_model["max_feature_idx"] + 1 - n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] - n_classes = lgb_model["num_tree_per_iteration"] - - is_regression = False - objective_fun = lgb_model["objective"] - if n_classes > 2: - if "multiclass" not in objective_fun: - raise TypeError( - "multiclass (softmax) objective is only supported for multiclass classification") - elif "binary" in objective_fun: # nClasses == 1 - n_classes = 2 - else: - is_regression = True - - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - - class_label = 0 - iterations_counter = 0 - for tree in lgb_model["tree_info"]: - if is_regression: - tree_id = mb.create_tree(tree["num_leaves"]*2-1) - else: - tree_id = mb.create_tree(n_nodes=tree["num_leaves"]*2-1, class_label=class_label) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = tree["tree_structure"] - - # root is leaf - if "leaf_value" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) - continue - - # add root - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], - feature_value=feat_val, default_left=default_left) - - # create stack - node_stack: List[Node] = [Node(sub_tree["left_child"], parent_id, 0), - Node(sub_tree["right_child"], parent_id, 1)] - - # dfs through it - while node_stack: - sub_tree = node_stack[-1].tree - parent_id = node_stack[-1].parent_id - position = node_stack[-1].position - node_stack.pop() - - # current node is leaf - if "leaf_index" in sub_tree: - mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf_value"], - parent_id=parent_id, position=position) - continue - - # current node is split - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - parent_id=parent_id, position=position) - - 
# append children - node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) - node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) - - return mb.model() - - -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - class Node: - def __init__(self, tree: Dict, parent_id: int, position: int): - self.tree = tree - self.parent_id = parent_id - self.position = position - - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. - lst = [*range(booster.num_features())] - booster.feature_names = [str(i) for i in lst] - - trees_arr = booster.get_dump(dump_format="json") - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification") - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification") - else: - is_regression = True - - if hasattr(booster, "best_iteration"): - n_iterations = booster.best_iteration + 1 - trees_arr = trees_arr[: n_iterations * (n_classes if n_classes > 2 else 1)] - else: - n_iterations = int(len(trees_arr) / (n_classes if n_classes > 2 else 1)) - - # Create + base iteration - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - - class_label = 0 - iterations_counter = 0 - mis_eq_yes = None - for tree in trees_arr: - n_nodes = 1 - # find out the number of nodes in the tree - for node in tree.split("nodeid")[1:]: - node_id = int(node[3:node.find(",")]) - if node_id + 1 > n_nodes: - n_nodes = node_id + 1 - if is_regression: - tree_id = mb.create_tree(n_nodes) - else: - tree_id = mb.create_tree(n_nodes=n_nodes, class_label=class_label) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = json.loads(tree) - - # root is leaf - if "leaf" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf"]) - continue - - # add root - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - parent_id = mb.add_split(tree_id=tree_id, feature_index=feature_index, - feature_value=feature_value, default_left=default_left) - - # create queue - node_queue: Deque[Node] = deque() - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - # bfs through it - while node_queue: - sub_tree = node_queue[0].tree - parent_id = 
node_queue[0].parent_id - position = node_queue[0].position - node_queue.popleft() - - # current node is leaf - if "leaf" in sub_tree: - mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf"], - parent_id=parent_id, position=position) - continue - - # current node is split - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - - parent_id = mb.add_split( - tree_id=tree_id, feature_index=feature_index, feature_value=feature_value, - default_left=default_left, parent_id=parent_id, position=position) - - # append to queue - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - return mb.model() - -def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: - if not model.is_fitted(): - raise RuntimeError( - "Model should be fitted before exporting to daal4py.") - - if model_data is None: - model_data = get_catboost_params(model) - - if 'categorical_features' in model_data['features_info']: - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") - - n_features = len(model_data['features_info']['float_features']) - - is_symmetric_tree = model_data['model_info']['params']['tree_learner_options']['grow_policy'] == 'SymmetricTree' - - if is_symmetric_tree: - n_iterations = len(model_data['oblivious_trees']) - else: - n_iterations = len(model_data['trees']) - - n_classes = 0 - - if 'class_params' in model_data['model_info']: - is_classification = True - n_classes = len(model_data['model_info'] - ['class_params']['class_to_label']) - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) - else: - is_classification = False - mb = gbt_reg_model_builder(n_features, n_iterations) - - splits = [] - - # Create splits array (all splits are placed sequentially) - for feature in model_data['features_info']['float_features']: - if feature['borders']: - for feature_border in feature['borders']: - splits.append( - {'feature_index': feature['feature_index'], 'value': feature_border}) - - if not is_classification: - bias = model_data['scale_and_bias'][1][0] / n_iterations - scale = model_data['scale_and_bias'][0] - else: - bias = 0 - scale = 1 - - trees_explicit = [] - tree_symmetric = [] - - if model_data['model_info']['params']['data_processing_options']['float_features_binarization']['nan_mode'] == 'Min': - default_left = 1 - else: - default_left = 0 - - for tree_num in range(n_iterations): - if is_symmetric_tree: - - if model_data['oblivious_trees'][tree_num]['splits'] is not None: - # Tree has more than 1 node - cur_tree_depth = len( - model_data['oblivious_trees'][tree_num]['splits']) - else: - cur_tree_depth = 0 - - tree_symmetric.append( - (model_data['oblivious_trees'][tree_num], cur_tree_depth)) - else: - class Node: - def __init__(self, parent=None, split=None, value=None) -> None: - self.right = None - self.left = None - self.split = split - self.value = value - - n_nodes = 1 - # Check if node is a leaf (in case of stump) - if 'split' in model_data['trees'][tree_num]: - # Get number of trees and splits info via BFS - # Create queue - nodes_queue = [] - root_node = Node( - split=splits[model_data['trees'][tree_num]['split']['split_index']]) - nodes_queue.append((model_data['trees'][tree_num], 
root_node)) - while nodes_queue: - cur_node_data, cur_node = nodes_queue.pop(0) - if 'value' in cur_node_data: - if isinstance(cur_node_data['value'], list): - cur_node.value = [ - value for value in cur_node_data['value']] - else: - cur_node.value = [ - cur_node_data['value'] * scale + bias] - else: - cur_node.split = splits[cur_node_data['split'] - ['split_index']] - left_node = Node() - right_node = Node() - cur_node.left = left_node - cur_node.right = right_node - nodes_queue.append((cur_node_data['left'], left_node)) - nodes_queue.append( - (cur_node_data['right'], right_node)) - n_nodes += 2 - else: - root_node = Node() - if is_classification and n_classes > 2: - root_node.value = [ - value * scale for value in model_data['trees'][tree_num]['value']] - else: - root_node.value = [model_data['trees'][tree_num]['value'] * scale + bias] - trees_explicit.append((root_node, n_nodes)) - - tree_id = [] - class_label = 0 - count = 0 - - # Only 1 tree for each iteration in case of regression or binary classification - if not is_classification or n_classes == 2: - n_tree_each_iter = 1 - else: - n_tree_each_iter = n_classes - - # Create id for trees (for the right order in modelbuilder) - for i in range(n_iterations): - for c in range(n_tree_each_iter): - if is_symmetric_tree: - n_nodes = 2**(tree_symmetric[i][1] + 1) - 1 - else: - n_nodes = trees_explicit[i][1] - - if is_classification and n_classes > 2: - tree_id.append(mb.create_tree(n_nodes, class_label)) - count += 1 - if count == n_iterations: - class_label += 1 - count = 0 - - elif is_classification: - tree_id.append(mb.create_tree(n_nodes, 0)) - else: - tree_id.append(mb.create_tree(n_nodes)) - - - if is_symmetric_tree: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - cur_tree_info = tree_symmetric[i][0] - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info['leaf_values'] - cur_tree_depth = tree_symmetric[i][1] - - if cur_tree_depth == 0: - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) - else: - # One split used for the whole level - cur_level_split = splits[cur_tree_info['splits'] - [cur_tree_depth - 1]['split_index']] - root_id = mb.add_split( - tree_id=cur_tree_id, feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - prev_level_nodes = [root_id] - - # Iterate over levels, splits in json are reversed (root split is the last) - for cur_level in range(cur_tree_depth - 2, -1, -1): - cur_level_nodes = [] - for cur_parent in prev_level_nodes: - cur_level_split = splits[cur_tree_info['splits'] - [cur_level]['split_index']] - cur_left_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=0, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - cur_right_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=1, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - cur_level_nodes.append(cur_left_node) - cur_level_nodes.append(cur_right_node) - prev_level_nodes = cur_level_nodes - - # Different storing format for leaves - if not is_classification or n_classes == 2: - for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) - mb.add_leaf(tree_id=cur_tree_id, 
response=cur_tree_leaf_val[2 * last_level_node_num + 1] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) - else: - for last_level_node_num in range(len(prev_level_nodes)): - left_index = 2 * last_level_node_num * n_tree_each_iter + class_label - right_index = (2 * last_level_node_num + 1) * \ - n_tree_each_iter + class_label - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) - else: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - root_node = trees_explicit[i][0] - - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - # Traverse tree via BFS and build tree with modelbuilder - if root_node.value is None: - root_id = mb.add_split( - tree_id=cur_tree_id, feature_index=root_node.split['feature_index'], feature_value=root_node.split['value'], - default_left=default_left) - nodes_queue = [(root_node, root_id)] - while nodes_queue: - cur_node, cur_node_id = nodes_queue.pop(0) - left_node = cur_node.left - # Check if node is a leaf - if left_node.value is None: - left_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=0, - feature_index=left_node.split['feature_index'], feature_value=left_node.split['value'], - default_left=default_left) - nodes_queue.append((left_node, left_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, response=left_node.value[class_label], parent_id=cur_node_id, position=0) - right_node = cur_node.right - # Check if node is a leaf - if right_node.value is None: - right_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=1, - feature_index=right_node.split['feature_index'], feature_value=right_node.split['value'], - default_left=default_left) - nodes_queue.append((right_node, right_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, response=cur_node.right.value[class_label], - parent_id=cur_node_id, position=1) - - else: - # Tree has only one node - mb.add_leaf(tree_id=cur_tree_id, - response=root_node.value[class_label]) - - return mb.model() From db445edf7f105019a6b9ab2572b85499d85da52f Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 27 Jul 2023 03:05:48 -0700 Subject: [PATCH 03/64] isort/black --- generator/gen_daal4py.py | 2 +- src/gbt_convertors.py | 331 ++++++++++++++++++++++++++------------- 2 files changed, 220 insertions(+), 113 deletions(-) diff --git a/generator/gen_daal4py.py b/generator/gen_daal4py.py index defc159d41..404d859e8a 100755 --- a/generator/gen_daal4py.py +++ b/generator/gen_daal4py.py @@ -1235,7 +1235,7 @@ def gen_daal4py(dalroot, outdir, version, warn_all=False, no_dist=False, no_stre ): with open(jp("src", "gbt_model_builder.pyx"), "r") as f: pyx_gbt_model_builder = f.read() - with open(jp('src', 'gbt_convertors.py'), 'r') as f: + with open(jp("src", "gbt_convertors.py"), "r") as f: pyx_gbt_generators = f.read() if ( "algorithms::logistic_regression" in iface.namespace_dict diff --git a/src/gbt_convertors.py b/src/gbt_convertors.py index 297bfde1dc..380a44b82d 100755 --- a/src/gbt_convertors.py +++ b/src/gbt_convertors.py @@ -1,4 +1,4 @@ -#=============================================================================== +# =============================================================================== # Copyright 2020 Intel Corporation # # Licensed under the 
Apache License, Version 2.0 (the "License"); @@ -12,28 +12,31 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== -from typing import List, Deque, Dict, Any, Optional -from collections import deque -from os import remove, getpid import json import re +from collections import deque +from os import getpid, remove from time import time +from typing import Any, Deque, Dict, List, Optional from attr import dataclass + def get_lightgbm_params(booster): return booster.dump_model() + def get_xgboost_params(booster): return json.loads(booster.save_config()) + def get_catboost_params(booster): dump_filename = f"catboost_model_{getpid()}_{time()}" # Dump model in file - booster.save_model(dump_filename, 'json') + booster.save_model(dump_filename, "json") # Read json with model with open(dump_filename) as file: @@ -43,7 +46,8 @@ def get_catboost_params(booster): remove(dump_filename) return model_data -def get_gbt_model_from_lightgbm(model: Any, lgb_model = None) -> Any: + +def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: @dataclass class Node: tree: Dict[str, Any] @@ -62,7 +66,8 @@ class Node: if n_classes > 2: if "multiclass" not in objective_fun: raise TypeError( - "multiclass (softmax) objective is only supported for multiclass classification") + "multiclass (softmax) objective is only supported for multiclass classification" + ) elif "binary" in objective_fun: # nClasses == 1 n_classes = 2 else: @@ -72,15 +77,18 @@ class Node: mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) else: mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) class_label = 0 iterations_counter = 0 for tree in lgb_model["tree_info"]: if is_regression: - tree_id = mb.create_tree(tree["num_leaves"]*2-1) + tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1) else: - tree_id = mb.create_tree(n_nodes=tree["num_leaves"]*2-1, class_label=class_label) + tree_id = mb.create_tree( + n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label + ) iterations_counter += 1 if iterations_counter == n_iterations: @@ -97,15 +105,21 @@ class Node: feat_val = sub_tree["threshold"] if isinstance(feat_val, str): raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) default_left = int(sub_tree["default_left"]) parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], - feature_value=feat_val, default_left=default_left) + tree_id=tree_id, + feature_index=sub_tree["split_feature"], + feature_value=feat_val, + default_left=default_left, + ) # create stack - node_stack: List[Node] = [Node(sub_tree["left_child"], parent_id, 0), - Node(sub_tree["right_child"], parent_id, 1)] + node_stack: List[Node] = [ + Node(sub_tree["left_child"], parent_id, 0), + Node(sub_tree["right_child"], parent_id, 1), + ] # dfs through it while node_stack: @@ -117,21 +131,28 @@ class Node: # current node is leaf if "leaf_index" in sub_tree: mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf_value"], - parent_id=parent_id, position=position) + 
tree_id=tree_id, + response=sub_tree["leaf_value"], + parent_id=parent_id, + position=position, + ) continue # current node is split feat_val = sub_tree["threshold"] if isinstance(feat_val, str): raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) default_left = int(sub_tree["default_left"]) parent_id = mb.add_split( - tree_id=tree_id, feature_index=sub_tree["split_feature"], + tree_id=tree_id, + feature_index=sub_tree["split_feature"], feature_value=feat_val, default_left=default_left, - parent_id=parent_id, position=position) + parent_id=parent_id, + position=position, + ) # append children node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) @@ -166,13 +187,15 @@ class Node: if n_classes > 2: if objective_fun not in ["multi:softprob", "multi:softmax"]: raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification") + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) elif objective_fun.find("binary:") == 0: if objective_fun in ["binary:logistic", "binary:logitraw"]: n_classes = 2 else: raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification") + "binary:logistic and binary:logitraw are only supported for binary classification" + ) else: is_regression = True @@ -188,7 +211,8 @@ class Node: mb.add_leaf(tree_id=tree_id, response=base_score) else: mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) class_label = 0 iterations_counter = 0 @@ -196,7 +220,7 @@ class Node: n_nodes = 1 # find out the number of nodes in the tree for node in tree.split("nodeid")[1:]: - node_id = int(node[3:node.find(",")]) + node_id = int(node[3 : node.find(",")]) if node_id + 1 > n_nodes: n_nodes = node_id + 1 if is_regression: @@ -220,10 +244,16 @@ class Node: feature_index = int(sub_tree["split"]) except ValueError: raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) + feature_value = np.nextafter( + np.single(sub_tree["split_condition"]), np.single(-np.inf) + ) default_left = int(sub_tree["yes"] == sub_tree["missing"]) - parent_id = mb.add_split(tree_id=tree_id, feature_index=feature_index, - feature_value=feature_value, default_left=default_left) + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=feature_index, + feature_value=feature_value, + default_left=default_left, + ) # create queue node_queue: Deque[Node] = deque() @@ -240,8 +270,11 @@ class Node: # current node is leaf if "leaf" in sub_tree: mb.add_leaf( - tree_id=tree_id, response=sub_tree["leaf"], - parent_id=parent_id, position=position) + tree_id=tree_id, + response=sub_tree["leaf"], + parent_id=parent_id, + position=position, + ) continue # current node is split @@ -249,12 +282,19 @@ class Node: feature_index = int(sub_tree["split"]) except ValueError: raise TypeError("Feature names must be integers") - feature_value = np.nextafter(np.single(sub_tree["split_condition"]), np.single(-np.inf)) + feature_value = np.nextafter( + np.single(sub_tree["split_condition"]), np.single(-np.inf) + ) default_left = int(sub_tree["yes"] == sub_tree["missing"]) parent_id = mb.add_split( - tree_id=tree_id, feature_index=feature_index, feature_value=feature_value, - 
default_left=default_left, parent_id=parent_id, position=position) + tree_id=tree_id, + feature_index=feature_index, + feature_value=feature_value, + default_left=default_left, + parent_id=parent_id, + position=position, + ) # append to queue node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) @@ -262,35 +302,39 @@ class Node: return mb.model() + def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: if not model.is_fitted(): - raise RuntimeError( - "Model should be fitted before exporting to daal4py.") + raise RuntimeError("Model should be fitted before exporting to daal4py.") if model_data is None: model_data = get_catboost_params(model) - if 'categorical_features' in model_data['features_info']: + if "categorical_features" in model_data["features_info"]: raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees") + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) - n_features = len(model_data['features_info']['float_features']) + n_features = len(model_data["features_info"]["float_features"]) - is_symmetric_tree = model_data['model_info']['params']['tree_learner_options']['grow_policy'] == 'SymmetricTree' + is_symmetric_tree = ( + model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] + == "SymmetricTree" + ) if is_symmetric_tree: - n_iterations = len(model_data['oblivious_trees']) + n_iterations = len(model_data["oblivious_trees"]) else: - n_iterations = len(model_data['trees']) + n_iterations = len(model_data["trees"]) n_classes = 0 - if 'class_params' in model_data['model_info']: + if "class_params" in model_data["model_info"]: is_classification = True - n_classes = len(model_data['model_info'] - ['class_params']['class_to_label']) + n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes) + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) else: is_classification = False mb = gbt_reg_model_builder(n_features, n_iterations) @@ -298,15 +342,16 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: splits = [] # Create splits array (all splits are placed sequentially) - for feature in model_data['features_info']['float_features']: - if feature['borders']: - for feature_border in feature['borders']: + for feature in model_data["features_info"]["float_features"]: + if feature["borders"]: + for feature_border in feature["borders"]: splits.append( - {'feature_index': feature['feature_index'], 'value': feature_border}) + {"feature_index": feature["feature_index"], "value": feature_border} + ) if not is_classification: - bias = model_data['scale_and_bias'][1][0] / n_iterations - scale = model_data['scale_and_bias'][0] + bias = model_data["scale_and_bias"][1][0] / n_iterations + scale = model_data["scale_and_bias"][0] else: bias = 0 scale = 1 @@ -314,24 +359,29 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: trees_explicit = [] tree_symmetric = [] - if model_data['model_info']['params']['data_processing_options']['float_features_binarization']['nan_mode'] == 'Min': + if ( + model_data["model_info"]["params"]["data_processing_options"][ + "float_features_binarization" + ]["nan_mode"] + == "Min" + ): default_left = 1 else: default_left = 0 for tree_num in range(n_iterations): if is_symmetric_tree: - - if model_data['oblivious_trees'][tree_num]['splits'] is not None: + if 
model_data["oblivious_trees"][tree_num]["splits"] is not None: # Tree has more than 1 node - cur_tree_depth = len( - model_data['oblivious_trees'][tree_num]['splits']) + cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) else: cur_tree_depth = 0 tree_symmetric.append( - (model_data['oblivious_trees'][tree_num], cur_tree_depth)) + (model_data["oblivious_trees"][tree_num], cur_tree_depth) + ) else: + @dataclass class Node: split: Optional[float] = None @@ -341,40 +391,40 @@ class Node: n_nodes = 1 # Check if node is a leaf (in case of stump) - if 'split' in model_data['trees'][tree_num]: + if "split" in model_data["trees"][tree_num]: # Get number of trees and splits info via BFS # Create queue nodes_queue = [] root_node = Node( - split=splits[model_data['trees'][tree_num]['split']['split_index']]) - nodes_queue.append((model_data['trees'][tree_num], root_node)) + split=splits[model_data["trees"][tree_num]["split"]["split_index"]] + ) + nodes_queue.append((model_data["trees"][tree_num], root_node)) while nodes_queue: cur_node_data, cur_node = nodes_queue.pop(0) - if 'value' in cur_node_data: - if isinstance(cur_node_data['value'], list): - cur_node.value = [ - value for value in cur_node_data['value']] + if "value" in cur_node_data: + if isinstance(cur_node_data["value"], list): + cur_node.value = [value for value in cur_node_data["value"]] else: - cur_node.value = [ - cur_node_data['value'] * scale + bias] + cur_node.value = [cur_node_data["value"] * scale + bias] else: - cur_node.split = splits[cur_node_data['split'] - ['split_index']] + cur_node.split = splits[cur_node_data["split"]["split_index"]] left_node = Node() right_node = Node() cur_node.left = left_node cur_node.right = right_node - nodes_queue.append((cur_node_data['left'], left_node)) - nodes_queue.append( - (cur_node_data['right'], right_node)) + nodes_queue.append((cur_node_data["left"], left_node)) + nodes_queue.append((cur_node_data["right"], right_node)) n_nodes += 2 else: root_node = Node() if is_classification and n_classes > 2: root_node.value = [ - value * scale for value in model_data['trees'][tree_num]['value']] + value * scale for value in model_data["trees"][tree_num]["value"] + ] else: - root_node.value = [model_data['trees'][tree_num]['value'] * scale + bias] + root_node.value = [ + model_data["trees"][tree_num]["value"] * scale + bias + ] trees_explicit.append((root_node, n_nodes)) tree_id = [] @@ -391,7 +441,7 @@ class Node: for i in range(n_iterations): for c in range(n_tree_each_iter): if is_symmetric_tree: - n_nodes = 2**(tree_symmetric[i][1] + 1) - 1 + n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 else: n_nodes = trees_explicit[i][1] @@ -407,39 +457,52 @@ class Node: else: tree_id.append(mb.create_tree(n_nodes)) - if is_symmetric_tree: for class_label in range(n_tree_each_iter): for i in range(n_iterations): cur_tree_info = tree_symmetric[i][0] cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info['leaf_values'] + cur_tree_leaf_val = cur_tree_info["leaf_values"] cur_tree_depth = tree_symmetric[i][1] if cur_tree_depth == 0: - mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) else: # One split used for the whole level - cur_level_split = splits[cur_tree_info['splits'] - [cur_tree_depth - 1]['split_index']] + cur_level_split = splits[ + cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] + ] root_id = mb.add_split( - tree_id=cur_tree_id, 
feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) + tree_id=cur_tree_id, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) prev_level_nodes = [root_id] # Iterate over levels, splits in json are reversed (root split is the last) for cur_level in range(cur_tree_depth - 2, -1, -1): cur_level_nodes = [] for cur_parent in prev_level_nodes: - cur_level_split = splits[cur_tree_info['splits'] - [cur_level]['split_index']] - cur_left_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=0, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) - cur_right_node = mb.add_split(tree_id=cur_tree_id, parent_id=cur_parent, position=1, - feature_index=cur_level_split['feature_index'], feature_value=cur_level_split['value'], - default_left=default_left) + cur_level_split = splits[ + cur_tree_info["splits"][cur_level]["split_index"] + ] + cur_left_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=0, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + cur_right_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=1, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) cur_level_nodes.append(cur_left_node) cur_level_nodes.append(cur_right_node) prev_level_nodes = cur_level_nodes @@ -447,19 +510,42 @@ class Node: # Different storing format for leaves if not is_classification or n_classes == 2: for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[2 * last_level_node_num + 1] - * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) else: for last_level_node_num in range(len(prev_level_nodes)): - left_index = 2 * last_level_node_num * n_tree_each_iter + class_label - right_index = (2 * last_level_node_num + 1) * \ - n_tree_each_iter + class_label + left_index = ( + 2 * last_level_node_num * n_tree_each_iter + class_label + ) + right_index = ( + 2 * last_level_node_num + 1 + ) * n_tree_each_iter + class_label mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0) + tree_id=cur_tree_id, + response=cur_tree_leaf_val[left_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) mb.add_leaf( - tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1) + tree_id=cur_tree_id, + response=cur_tree_leaf_val[right_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) else: for class_label in range(n_tree_each_iter): for i in range(n_iterations): @@ 
-469,36 +555,57 @@ class Node: # Traverse tree via BFS and build tree with modelbuilder if root_node.value is None: root_id = mb.add_split( - tree_id=cur_tree_id, feature_index=root_node.split['feature_index'], feature_value=root_node.split['value'], - default_left=default_left) + tree_id=cur_tree_id, + feature_index=root_node.split["feature_index"], + feature_value=root_node.split["value"], + default_left=default_left, + ) nodes_queue = [(root_node, root_id)] while nodes_queue: cur_node, cur_node_id = nodes_queue.pop(0) left_node = cur_node.left # Check if node is a leaf if left_node.value is None: - left_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=0, - feature_index=left_node.split['feature_index'], feature_value=left_node.split['value'], - default_left=default_left) + left_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=0, + feature_index=left_node.split["feature_index"], + feature_value=left_node.split["value"], + default_left=default_left, + ) nodes_queue.append((left_node, left_node_id)) else: mb.add_leaf( - tree_id=cur_tree_id, response=left_node.value[class_label], parent_id=cur_node_id, position=0) + tree_id=cur_tree_id, + response=left_node.value[class_label], + parent_id=cur_node_id, + position=0, + ) right_node = cur_node.right # Check if node is a leaf if right_node.value is None: - right_node_id = mb.add_split(tree_id=cur_tree_id, parent_id=cur_node_id, position=1, - feature_index=right_node.split['feature_index'], feature_value=right_node.split['value'], - default_left=default_left) + right_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=1, + feature_index=right_node.split["feature_index"], + feature_value=right_node.split["value"], + default_left=default_left, + ) nodes_queue.append((right_node, right_node_id)) else: mb.add_leaf( - tree_id=cur_tree_id, response=cur_node.right.value[class_label], - parent_id=cur_node_id, position=1) + tree_id=cur_tree_id, + response=cur_node.right.value[class_label], + parent_id=cur_node_id, + position=1, + ) else: # Tree has only one node - mb.add_leaf(tree_id=cur_tree_id, - response=root_node.value[class_label]) + mb.add_leaf( + tree_id=cur_tree_id, response=root_node.value[class_label] + ) return mb.model() From 5025f60ef22c2b46729db2039f4ff6d61ba705c6 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 27 Jul 2023 07:50:02 -0700 Subject: [PATCH 04/64] refactor get_gbt_model_from_xgboost() with improved Node classes --- src/gbt_convertors.py | 1351 ++++++++++++++++++++++------------------- 1 file changed, 740 insertions(+), 611 deletions(-) diff --git a/src/gbt_convertors.py b/src/gbt_convertors.py index 380a44b82d..4007e5704d 100755 --- a/src/gbt_convertors.py +++ b/src/gbt_convertors.py @@ -1,611 +1,740 @@ -# =============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# =============================================================================== - -import json -import re -from collections import deque -from os import getpid, remove -from time import time -from typing import Any, Deque, Dict, List, Optional - -from attr import dataclass - - -def get_lightgbm_params(booster): - return booster.dump_model() - - -def get_xgboost_params(booster): - return json.loads(booster.save_config()) - - -def get_catboost_params(booster): - dump_filename = f"catboost_model_{getpid()}_{time()}" - - # Dump model in file - booster.save_model(dump_filename, "json") - - # Read json with model - with open(dump_filename) as file: - model_data = json.load(file) - - # Delete dump file - remove(dump_filename) - return model_data - - -def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: - @dataclass - class Node: - tree: Dict[str, Any] - parent_id: int - position: int - - if lgb_model is None: - lgb_model = get_lightgbm_params(model) - - n_features = lgb_model["max_feature_idx"] + 1 - n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] - n_classes = lgb_model["num_tree_per_iteration"] - - is_regression = False - objective_fun = lgb_model["objective"] - if n_classes > 2: - if "multiclass" not in objective_fun: - raise TypeError( - "multiclass (softmax) objective is only supported for multiclass classification" - ) - elif "binary" in objective_fun: # nClasses == 1 - n_classes = 2 - else: - is_regression = True - - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - - class_label = 0 - iterations_counter = 0 - for tree in lgb_model["tree_info"]: - if is_regression: - tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1) - else: - tree_id = mb.create_tree( - n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label - ) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = tree["tree_structure"] - - # root is leaf - if "leaf_value" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) - continue - - # add root - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - ) - - # create stack - node_stack: List[Node] = [ - Node(sub_tree["left_child"], parent_id, 0), - Node(sub_tree["right_child"], parent_id, 1), - ] - - # dfs through it - while node_stack: - sub_tree = node_stack[-1].tree - parent_id = node_stack[-1].parent_id - position = node_stack[-1].position - node_stack.pop() - - # current node is leaf - if "leaf_index" in sub_tree: - mb.add_leaf( - tree_id=tree_id, - response=sub_tree["leaf_value"], - parent_id=parent_id, - position=position, - ) - continue - - # current node is split - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - parent_id=parent_id, - 
position=position, - ) - - # append children - node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) - node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) - - return mb.model() - - -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - @dataclass - class Node: - tree: Dict[str, Any] - parent_id: int - position: int - cover: float - - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. - lst = [*range(booster.num_features())] - booster.feature_names = [str(i) for i in lst] - - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification" - ) - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification" - ) - else: - is_regression = True - - n_iterations = booster.best_iteration + 1 - booster_dump = booster.get_dump(dump_format="json") - trees_arr = booster_dump[: n_iterations * (n_classes if n_classes > 2 else 1)] - - # Create + base iteration - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - - class_label = 0 - iterations_counter = 0 - for tree in trees_arr: - n_nodes = 1 - # find out the number of nodes in the tree - for node in tree.split("nodeid")[1:]: - node_id = int(node[3 : node.find(",")]) - if node_id + 1 > n_nodes: - n_nodes = node_id + 1 - if is_regression: - tree_id = mb.create_tree(n_nodes) - else: - tree_id = mb.create_tree(n_nodes=n_nodes, class_label=class_label) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = json.loads(tree) - - # root is leaf - if "leaf" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf"]) - continue - - # add root - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter( - np.single(sub_tree["split_condition"]), np.single(-np.inf) - ) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=feature_index, - feature_value=feature_value, - default_left=default_left, - ) - - # create queue - node_queue: Deque[Node] = deque() - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - # bfs through it - while node_queue: - sub_tree = node_queue[0].tree - parent_id = node_queue[0].parent_id - position = node_queue[0].position - node_queue.popleft() - - # current node is leaf - if "leaf" in sub_tree: - 
mb.add_leaf( - tree_id=tree_id, - response=sub_tree["leaf"], - parent_id=parent_id, - position=position, - ) - continue - - # current node is split - try: - feature_index = int(sub_tree["split"]) - except ValueError: - raise TypeError("Feature names must be integers") - feature_value = np.nextafter( - np.single(sub_tree["split_condition"]), np.single(-np.inf) - ) - default_left = int(sub_tree["yes"] == sub_tree["missing"]) - - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=feature_index, - feature_value=feature_value, - default_left=default_left, - parent_id=parent_id, - position=position, - ) - - # append to queue - node_queue.append(Node(sub_tree["children"][0], parent_id, 0)) - node_queue.append(Node(sub_tree["children"][1], parent_id, 1)) - - return mb.model() - - -def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: - if not model.is_fitted(): - raise RuntimeError("Model should be fitted before exporting to daal4py.") - - if model_data is None: - model_data = get_catboost_params(model) - - if "categorical_features" in model_data["features_info"]: - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - - n_features = len(model_data["features_info"]["float_features"]) - - is_symmetric_tree = ( - model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] - == "SymmetricTree" - ) - - if is_symmetric_tree: - n_iterations = len(model_data["oblivious_trees"]) - else: - n_iterations = len(model_data["trees"]) - - n_classes = 0 - - if "class_params" in model_data["model_info"]: - is_classification = True - n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - else: - is_classification = False - mb = gbt_reg_model_builder(n_features, n_iterations) - - splits = [] - - # Create splits array (all splits are placed sequentially) - for feature in model_data["features_info"]["float_features"]: - if feature["borders"]: - for feature_border in feature["borders"]: - splits.append( - {"feature_index": feature["feature_index"], "value": feature_border} - ) - - if not is_classification: - bias = model_data["scale_and_bias"][1][0] / n_iterations - scale = model_data["scale_and_bias"][0] - else: - bias = 0 - scale = 1 - - trees_explicit = [] - tree_symmetric = [] - - if ( - model_data["model_info"]["params"]["data_processing_options"][ - "float_features_binarization" - ]["nan_mode"] - == "Min" - ): - default_left = 1 - else: - default_left = 0 - - for tree_num in range(n_iterations): - if is_symmetric_tree: - if model_data["oblivious_trees"][tree_num]["splits"] is not None: - # Tree has more than 1 node - cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) - else: - cur_tree_depth = 0 - - tree_symmetric.append( - (model_data["oblivious_trees"][tree_num], cur_tree_depth) - ) - else: - - @dataclass - class Node: - split: Optional[float] = None - value: Optional[list[float]] = None - right: Optional[int] = None - left: Optional[int] = None - - n_nodes = 1 - # Check if node is a leaf (in case of stump) - if "split" in model_data["trees"][tree_num]: - # Get number of trees and splits info via BFS - # Create queue - nodes_queue = [] - root_node = Node( - split=splits[model_data["trees"][tree_num]["split"]["split_index"]] - ) - nodes_queue.append((model_data["trees"][tree_num], root_node)) - while nodes_queue: - cur_node_data, cur_node = nodes_queue.pop(0) - if 
"value" in cur_node_data: - if isinstance(cur_node_data["value"], list): - cur_node.value = [value for value in cur_node_data["value"]] - else: - cur_node.value = [cur_node_data["value"] * scale + bias] - else: - cur_node.split = splits[cur_node_data["split"]["split_index"]] - left_node = Node() - right_node = Node() - cur_node.left = left_node - cur_node.right = right_node - nodes_queue.append((cur_node_data["left"], left_node)) - nodes_queue.append((cur_node_data["right"], right_node)) - n_nodes += 2 - else: - root_node = Node() - if is_classification and n_classes > 2: - root_node.value = [ - value * scale for value in model_data["trees"][tree_num]["value"] - ] - else: - root_node.value = [ - model_data["trees"][tree_num]["value"] * scale + bias - ] - trees_explicit.append((root_node, n_nodes)) - - tree_id = [] - class_label = 0 - count = 0 - - # Only 1 tree for each iteration in case of regression or binary classification - if not is_classification or n_classes == 2: - n_tree_each_iter = 1 - else: - n_tree_each_iter = n_classes - - # Create id for trees (for the right order in modelbuilder) - for i in range(n_iterations): - for c in range(n_tree_each_iter): - if is_symmetric_tree: - n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 - else: - n_nodes = trees_explicit[i][1] - - if is_classification and n_classes > 2: - tree_id.append(mb.create_tree(n_nodes, class_label)) - count += 1 - if count == n_iterations: - class_label += 1 - count = 0 - - elif is_classification: - tree_id.append(mb.create_tree(n_nodes, 0)) - else: - tree_id.append(mb.create_tree(n_nodes)) - - if is_symmetric_tree: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - cur_tree_info = tree_symmetric[i][0] - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info["leaf_values"] - cur_tree_depth = tree_symmetric[i][1] - - if cur_tree_depth == 0: - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) - else: - # One split used for the whole level - cur_level_split = splits[ - cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] - ] - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - prev_level_nodes = [root_id] - - # Iterate over levels, splits in json are reversed (root split is the last) - for cur_level in range(cur_tree_depth - 2, -1, -1): - cur_level_nodes = [] - for cur_parent in prev_level_nodes: - cur_level_split = splits[ - cur_tree_info["splits"][cur_level]["split_index"] - ] - cur_left_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=0, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - cur_right_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=1, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - cur_level_nodes.append(cur_left_node) - cur_level_nodes.append(cur_right_node) - prev_level_nodes = cur_level_nodes - - # Different storing format for leaves - if not is_classification or n_classes == 2: - for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * last_level_node_num] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * 
last_level_node_num + 1] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - ) - else: - for last_level_node_num in range(len(prev_level_nodes)): - left_index = ( - 2 * last_level_node_num * n_tree_each_iter + class_label - ) - right_index = ( - 2 * last_level_node_num + 1 - ) * n_tree_each_iter + class_label - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[left_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[right_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - ) - else: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - root_node = trees_explicit[i][0] - - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - # Traverse tree via BFS and build tree with modelbuilder - if root_node.value is None: - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=root_node.split["feature_index"], - feature_value=root_node.split["value"], - default_left=default_left, - ) - nodes_queue = [(root_node, root_id)] - while nodes_queue: - cur_node, cur_node_id = nodes_queue.pop(0) - left_node = cur_node.left - # Check if node is a leaf - if left_node.value is None: - left_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=0, - feature_index=left_node.split["feature_index"], - feature_value=left_node.split["value"], - default_left=default_left, - ) - nodes_queue.append((left_node, left_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, - response=left_node.value[class_label], - parent_id=cur_node_id, - position=0, - ) - right_node = cur_node.right - # Check if node is a leaf - if right_node.value is None: - right_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=1, - feature_index=right_node.split["feature_index"], - feature_value=right_node.split["value"], - default_left=default_left, - ) - nodes_queue.append((right_node, right_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_node.right.value[class_label], - parent_id=cur_node_id, - position=1, - ) - - else: - # Tree has only one node - mb.add_leaf( - tree_id=cur_tree_id, response=root_node.value[class_label] - ) - - return mb.model() +# =============================================================================== +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== + +import json +from collections import deque +from dataclasses import dataclass +from os import getpid, remove +from time import time +from typing import TYPE_CHECKING, Any, Deque, Dict, Generator, List, Optional + +import numpy as np + +if TYPE_CHECKING: + import xgboost as xgb + + +def get_lightgbm_params(booster): + return booster.dump_model() + + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + + +def get_catboost_params(booster): + dump_filename = f"catboost_model_{getpid()}_{time()}" + + # Dump model in file + booster.save_model(dump_filename, "json") + + # Read json with model + with open(dump_filename) as file: + model_data = json.load(file) + + # Delete dump file + remove(dump_filename) + return model_data + + +def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: + @dataclass + class Node: + tree: Dict[str, Any] + parent_id: int + position: int + + if lgb_model is None: + lgb_model = get_lightgbm_params(model) + + n_features = lgb_model["max_feature_idx"] + 1 + n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] + n_classes = lgb_model["num_tree_per_iteration"] + + is_regression = False + objective_fun = lgb_model["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass classification" + ) + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + iterations_counter = 0 + for tree in lgb_model["tree_info"]: + if is_regression: + tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1) + else: + tree_id = mb.create_tree( + n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label + ) + + iterations_counter += 1 + if iterations_counter == n_iterations: + iterations_counter = 0 + class_label += 1 + sub_tree = tree["tree_structure"] + + # root is leaf + if "leaf_value" in sub_tree: + mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) + continue + + # add root + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=sub_tree["split_feature"], + feature_value=feat_val, + default_left=default_left, + ) + + # create stack + node_stack: List[Node] = [ + Node(sub_tree["left_child"], parent_id, 0), + Node(sub_tree["right_child"], parent_id, 1), + ] + + # dfs through it + while node_stack: + sub_tree = node_stack[-1].tree + parent_id = node_stack[-1].parent_id + position = node_stack[-1].position + node_stack.pop() + + # current node is leaf + if "leaf_index" in sub_tree: + mb.add_leaf( + tree_id=tree_id, + response=sub_tree["leaf_value"], + parent_id=parent_id, + position=position, + ) + continue + + # current node is split + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, + 
feature_index=sub_tree["split_feature"], + feature_value=feat_val, + default_left=default_left, + parent_id=parent_id, + position=position, + ) + + # append children + node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) + node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) + + return mb.model() + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + class NodeList(list): + """Helper class that is able to extract all information required by the + model builders from an XGBooster.Booster object""" + + @dataclass + class Node: + tree_id: int + node_id: int + left_child_id: Optional[int] + right_child_id: Optional[int] + cover: float + is_leaf: bool + default_left: bool + feature: Optional[int] + value: Optional[float] + parent_id: Optional[int] = -1 + position: Optional[int] = -1 + + def get_value_closest_float_downward(self) -> np.float64: + """Get the closest exact fp value smaller than self.value""" + return np.nextafter(np.single(self.value), np.single(-np.inf)) + + class TreeView: + def __init__(self, tree_id: int, nodes: "list[NodeList.Node]") -> None: + self.tree_id = tree_id + self.nodes = nodes + self.n_nodes = len(nodes) + + @property + def is_leaf(self) -> bool: + return len(self.nodes) == 1 and self.nodes[0].is_leaf + + @property + def value(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + if not self.nodes[0].value: + raise ValueError("Tree is leaf-only but leaf node has no value") + return self.nodes[0].value + + def get_children( + self, node: "NodeList.Node" + ) -> "tuple[NodeList.Node, NodeList.Node]": + """Find children of the provided node""" + children_ids = (node.left_child_id, node.right_child_id) + selection = [n for n in self.nodes if n.node_id in children_ids] + assert ( + len(selection) == 2 + ), f"Found {len(selection)} (!= 2) child nodes for node {node}" + return tuple(selection) + + @staticmethod + def from_booster(booster: xgb.Booster) -> "NodeList": + """Create a TreeList object from a xgb.Booster object""" + tl = NodeList() + df = booster.trees_to_dataframe() + for _, node in df.iterrows(): + tree_id, node_id = map(int, node["ID"].split("-")) # e.g. 0-1 + is_leaf = node["Feature"] == "Leaf" + left_child_id = ( + int(node["Yes"].split("-")[1]) + if isinstance(node["Yes"], str) + else None + ) + right_child_id = ( + int(node["No"].split("-")[1]) if isinstance(node["No"], str) else None + ) + assert ( + left_child_id is None or left_child_id != right_child_id + ), "Children ID mismatch" + tl.append( + NodeList.Node( + tree_id=tree_id, + node_id=node_id, + left_child_id=left_child_id, + right_child_id=right_child_id, + cover=node["Cover"], + feature=int(node["Feature"]) + if node["Feature"].isnumeric() + else None, + is_leaf=is_leaf, + default_left=node["Yes"] == node["Missing"], + value=None if is_leaf else node["Split"], + ) + ) + + # fill the missing leaf values which are not part of the dataframe + tl._fill_leaf_values(booster.get_dump(dump_format="json")) + + return tl + + def iter_trees(self) -> "Generator[NodeList.TreeView, None, None]": + """Iterate over TreeViews""" + tree_ids = set((node.tree_id for node in self)) + for tid in tree_ids: + yield NodeList.TreeView( + tree_id=tid, nodes=[n for n in self if n.tree_id == tid] + ) + + def _fill_leaf_values(self, booster_dump: list[str]) -> None: + """Fill the leaf values (i.e. 
the predictions) from `booster_dump` + Note: These values are not contained in the pd.DataFrame format""" + + def get_leaf_nodes( + node: Dict[str, Any], leaf_nodes: list[Dict[str, Any]] = [] + ) -> None: + """Helper to get all leaf nodes from the json.loads() of the booster_dump""" + if "children" in node: + get_leaf_nodes(node["children"][0], leaf_nodes) + get_leaf_nodes(node["children"][1], leaf_nodes) + return + + if "leaf" not in node: + raise KeyError(f"Node does not have a 'leaf' value: {node}") + + leaf_nodes.append(node) + + root_nodes = [json.loads(s) for s in booster_dump] + + for tree_id, root_node in enumerate(root_nodes): + # + leaf_nodes = [] + get_leaf_nodes(root_node, leaf_nodes) + + for node in self: + if not node.is_leaf: + continue + + if node.tree_id != tree_id: + continue + + try: + node.value = float( + [ + l["leaf"] + for l in leaf_nodes + if l["nodeid"] == node.node_id + ].pop() + ) + except IndexError as e: + raise ValueError( + f"No leaf information for node {node.node_id} in tree {node.tree_id}" + ) from e + + # assert all tree leafs have a value + for node in self: + if node.is_leaf: + assert ( + node.value is not None + ), f"Failed to find leaf value for node {node}" + + def __setitem__(self): + raise NotImplementedError( + "Use TreeList.from_booster() to initialize a TreeList" + ) + + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. + booster.feature_names = [str(i) for i in range(booster.num_features())] + + if xgb_config is None: + xgb_config = get_xgboost_params(booster) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) + elif objective_fun.find("binary:") == 0: + if objective_fun in ["binary:logistic", "binary:logitraw"]: + n_classes = 2 + else: + raise TypeError( + "binary:logistic and binary:logitraw are only supported for binary classification" + ) + else: + is_regression = True + + n_iterations = booster.best_iteration + 1 + + # Create + base iteration + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) + + tree_id = mb.create_tree(1) + mb.add_leaf(tree_id=tree_id, response=base_score) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + node_list = NodeList.from_booster(booster) + for counter, tree in enumerate(node_list.iter_trees(), start=1): + # find out the number of nodes in the tree + if is_regression: + tree_id = mb.create_tree(tree.n_nodes) + else: + tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) + + if counter % n_iterations == 0: + class_label += 1 + + if tree.is_leaf: + mb.add_leaf(tree_id=tree_id, response=tree.value) + continue + + root_node = tree.nodes[0] + assert isinstance( + root_node.feature, int + ), f"Feature names must be integers (got ({type(root_node.feature)}){root_node.feature})" + parent_id = mb.add_split( + tree_id=tree_id, + 
feature_index=root_node.feature, + feature_value=root_node.get_value_closest_float_downward(), + default_left=root_node.default_left, + ) + + # create queue + node_queue: Deque[NodeList.Node] = deque() + children = tree.get_children(root_node) + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + while node_queue: + node = node_queue.popleft() + assert node.parent_id != -1, "node.parent_id must not be -1" + assert node.position != -1, "node.position must not be -1" + + if node.is_leaf: + mb.add_leaf( + tree_id=tree_id, + response=node.value, + parent_id=node.parent_id, + position=node.position, + ) + else: + assert isinstance( + node.feature, int + ), f"Feature names must be integers (got ({type(node.feature)}){node.feature})" + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=node.feature, + feature_value=node.get_value_closest_float_downward(), + default_left=node.default_left, + parent_id=node.parent_id, + position=node.position, + ) + + children = tree.get_children(node) + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + return mb.model() + + +def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + if not model.is_fitted(): + raise RuntimeError("Model should be fitted before exporting to daal4py.") + + if model_data is None: + model_data = get_catboost_params(model) + + if "categorical_features" in model_data["features_info"]: + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + + n_features = len(model_data["features_info"]["float_features"]) + + is_symmetric_tree = ( + model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] + == "SymmetricTree" + ) + + if is_symmetric_tree: + n_iterations = len(model_data["oblivious_trees"]) + else: + n_iterations = len(model_data["trees"]) + + n_classes = 0 + + if "class_params" in model_data["model_info"]: + is_classification = True + n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + else: + is_classification = False + mb = gbt_reg_model_builder(n_features, n_iterations) + + splits = [] + + # Create splits array (all splits are placed sequentially) + for feature in model_data["features_info"]["float_features"]: + if feature["borders"]: + for feature_border in feature["borders"]: + splits.append( + {"feature_index": feature["feature_index"], "value": feature_border} + ) + + if not is_classification: + bias = model_data["scale_and_bias"][1][0] / n_iterations + scale = model_data["scale_and_bias"][0] + else: + bias = 0 + scale = 1 + + trees_explicit = [] + tree_symmetric = [] + + if ( + model_data["model_info"]["params"]["data_processing_options"][ + "float_features_binarization" + ]["nan_mode"] + == "Min" + ): + default_left = 1 + else: + default_left = 0 + + for tree_num in range(n_iterations): + if is_symmetric_tree: + if model_data["oblivious_trees"][tree_num]["splits"] is not None: + # Tree has more than 1 node + cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) + else: + cur_tree_depth = 0 + + tree_symmetric.append( + (model_data["oblivious_trees"][tree_num], cur_tree_depth) + ) + else: + + @dataclass + class Node: + split: Optional[float] = None + value: Optional[list[float]] = None + right: Optional[int] = None + left: 
Optional[int] = None + + n_nodes = 1 + # Check if node is a leaf (in case of stump) + if "split" in model_data["trees"][tree_num]: + # Get number of trees and splits info via BFS + # Create queue + nodes_queue = [] + root_node = Node( + split=splits[model_data["trees"][tree_num]["split"]["split_index"]] + ) + nodes_queue.append((model_data["trees"][tree_num], root_node)) + while nodes_queue: + cur_node_data, cur_node = nodes_queue.pop(0) + if "value" in cur_node_data: + if isinstance(cur_node_data["value"], list): + cur_node.value = [value for value in cur_node_data["value"]] + else: + cur_node.value = [cur_node_data["value"] * scale + bias] + else: + cur_node.split = splits[cur_node_data["split"]["split_index"]] + left_node = Node() + right_node = Node() + cur_node.left = left_node + cur_node.right = right_node + nodes_queue.append((cur_node_data["left"], left_node)) + nodes_queue.append((cur_node_data["right"], right_node)) + n_nodes += 2 + else: + root_node = Node() + if is_classification and n_classes > 2: + root_node.value = [ + value * scale for value in model_data["trees"][tree_num]["value"] + ] + else: + root_node.value = [ + model_data["trees"][tree_num]["value"] * scale + bias + ] + trees_explicit.append((root_node, n_nodes)) + + tree_id = [] + class_label = 0 + count = 0 + + # Only 1 tree for each iteration in case of regression or binary classification + if not is_classification or n_classes == 2: + n_tree_each_iter = 1 + else: + n_tree_each_iter = n_classes + + # Create id for trees (for the right order in modelbuilder) + for i in range(n_iterations): + for c in range(n_tree_each_iter): + if is_symmetric_tree: + n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 + else: + n_nodes = trees_explicit[i][1] + + if is_classification and n_classes > 2: + tree_id.append(mb.create_tree(n_nodes, class_label)) + count += 1 + if count == n_iterations: + class_label += 1 + count = 0 + + elif is_classification: + tree_id.append(mb.create_tree(n_nodes, 0)) + else: + tree_id.append(mb.create_tree(n_nodes)) + + if is_symmetric_tree: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + cur_tree_info = tree_symmetric[i][0] + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + cur_tree_leaf_val = cur_tree_info["leaf_values"] + cur_tree_depth = tree_symmetric[i][1] + + if cur_tree_depth == 0: + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + else: + # One split used for the whole level + cur_level_split = splits[ + cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] + ] + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + prev_level_nodes = [root_id] + + # Iterate over levels, splits in json are reversed (root split is the last) + for cur_level in range(cur_tree_depth - 2, -1, -1): + cur_level_nodes = [] + for cur_parent in prev_level_nodes: + cur_level_split = splits[ + cur_tree_info["splits"][cur_level]["split_index"] + ] + cur_left_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=0, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + cur_right_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=1, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + cur_level_nodes.append(cur_left_node) + 
cur_level_nodes.append(cur_right_node) + prev_level_nodes = cur_level_nodes + + # Different storing format for leaves + if not is_classification or n_classes == 2: + for last_level_node_num in range(len(prev_level_nodes)): + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) + else: + for last_level_node_num in range(len(prev_level_nodes)): + left_index = ( + 2 * last_level_node_num * n_tree_each_iter + class_label + ) + right_index = ( + 2 * last_level_node_num + 1 + ) * n_tree_each_iter + class_label + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[left_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[right_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) + else: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + root_node = trees_explicit[i][0] + + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + # Traverse tree via BFS and build tree with modelbuilder + if root_node.value is None: + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=root_node.split["feature_index"], + feature_value=root_node.split["value"], + default_left=default_left, + ) + nodes_queue = [(root_node, root_id)] + while nodes_queue: + cur_node, cur_node_id = nodes_queue.pop(0) + left_node = cur_node.left + # Check if node is a leaf + if left_node.value is None: + left_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=0, + feature_index=left_node.split["feature_index"], + feature_value=left_node.split["value"], + default_left=default_left, + ) + nodes_queue.append((left_node, left_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=left_node.value[class_label], + parent_id=cur_node_id, + position=0, + ) + right_node = cur_node.right + # Check if node is a leaf + if right_node.value is None: + right_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=1, + feature_index=right_node.split["feature_index"], + feature_value=right_node.split["value"], + default_left=default_left, + ) + nodes_queue.append((right_node, right_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_node.right.value[class_label], + parent_id=cur_node_id, + position=1, + ) + + else: + # Tree has only one node + mb.add_leaf( + tree_id=cur_tree_id, response=root_node.value[class_label] + ) + + return mb.model() From a5695ae79c4f7343653c520a31feb66d6de2d3f6 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 27 Jul 2023 08:03:59 -0700 Subject: [PATCH 05/64] refactor: put new NodeList and related classes in module namespace --- src/gbt_convertors.py | 327 ++++++++++++++++++++---------------------- 1 file changed, 159 insertions(+), 168 deletions(-) diff --git a/src/gbt_convertors.py b/src/gbt_convertors.py index 4007e5704d..e24ae26f42 100755 --- a/src/gbt_convertors.py +++ b/src/gbt_convertors.py @@ -27,6 +27,153 @@ import xgboost as xgb +@dataclass +class Node: + """Helper class holding Tree Node information""" + + tree_id: int + node_id: int + left_child_id: Optional[int] + right_child_id: Optional[int] + cover: float + is_leaf: 
bool + default_left: bool + feature: Optional[int] + value: Optional[float] + parent_id: Optional[int] = -1 + position: Optional[int] = -1 + + def get_value_closest_float_downward(self) -> np.float64: + """Get the closest exact fp value smaller than self.value""" + return np.nextafter(np.single(self.value), np.single(-np.inf)) + + +class TreeView: + """Helper class, treating a list of nodes as one tree""" + + def __init__(self, tree_id: int, nodes: list[Node]) -> None: + self.tree_id = tree_id + self.nodes = nodes + self.n_nodes = len(nodes) + + @property + def is_leaf(self) -> bool: + return len(self.nodes) == 1 and self.nodes[0].is_leaf + + @property + def value(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + if not self.nodes[0].value: + raise ValueError("Tree is leaf-only but leaf node has no value") + return self.nodes[0].value + + def get_children(self, node: Node) -> tuple[Node, Node]: + """Find children of the provided node""" + children_ids = (node.left_child_id, node.right_child_id) + selection = [n for n in self.nodes if n.node_id in children_ids] + assert ( + len(selection) == 2 + ), f"Found {len(selection)} (!= 2) child nodes for node {node}" + return tuple(selection) + + +class NodeList(list): + """Helper class that is able to extract all information required by the + model builders from an XGBoost.Booster object""" + + @staticmethod + def from_booster(booster: xgb.Booster) -> "NodeList": + """Create a TreeList object from a xgb.Booster object""" + tl = NodeList() + df = booster.trees_to_dataframe() + for _, node in df.iterrows(): + tree_id, node_id = map(int, node["ID"].split("-")) # e.g. 0-1 + is_leaf = node["Feature"] == "Leaf" + left_child_id = ( + int(node["Yes"].split("-")[1]) if isinstance(node["Yes"], str) else None + ) + right_child_id = ( + int(node["No"].split("-")[1]) if isinstance(node["No"], str) else None + ) + tl.append( + Node( + tree_id=tree_id, + node_id=node_id, + left_child_id=left_child_id, + right_child_id=right_child_id, + cover=node["Cover"], + feature=int(node["Feature"]) if node["Feature"].isnumeric() else None, + is_leaf=is_leaf, + default_left=node["Yes"] == node["Missing"], + value=None if is_leaf else node["Split"], + ) + ) + + # fill the missing leaf values which are not part of the dataframe + tl._fill_leaf_values(booster.get_dump(dump_format="json")) + + return tl + + def iter_trees(self) -> Generator[TreeView, None, None]: + """Iterate over TreeViews""" + tree_ids = set((node.tree_id for node in self)) + for tid in tree_ids: + yield TreeView(tree_id=tid, nodes=[n for n in self if n.tree_id == tid]) + + def _fill_leaf_values(self, booster_dump: list[str]) -> None: + """Fill the leaf values (i.e. 
the predictions) from `booster_dump` + Note: These values are not contained in the pd.DataFrame format""" + + def get_leaf_nodes( + node: Dict[str, Any], leaf_nodes: list[Dict[str, Any]] = [] + ) -> None: + """Helper to get all leaf nodes from the json.loads() of the booster_dump""" + if "children" in node: + get_leaf_nodes(node["children"][0], leaf_nodes) + get_leaf_nodes(node["children"][1], leaf_nodes) + return + + if "leaf" not in node: + raise KeyError(f"Node does not have a 'leaf' value: {node}") + + leaf_nodes.append(node) + + root_nodes = [json.loads(s) for s in booster_dump] + + for tree_id, root_node in enumerate(root_nodes): + leaf_nodes = [] + get_leaf_nodes(root_node, leaf_nodes) + + for node in self: + if not node.is_leaf: + continue + + if node.tree_id != tree_id: + continue + + try: + node.value = float( + [ + l["leaf"] for l in leaf_nodes if l["nodeid"] == node.node_id + ].pop() + ) + except IndexError as e: + raise ValueError( + f"No leaf information for node {node.node_id} in tree {node.tree_id}" + ) from e + + # assert all tree leafs have a value + for node in self: + if node.is_leaf: + assert ( + node.value is not None + ), f"Failed to find leaf value for node {node}" + + def __setitem__(self): + raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList") + + def get_lightgbm_params(booster): return booster.dump_model() @@ -52,7 +199,7 @@ def get_catboost_params(booster): def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: @dataclass - class Node: + class LightGbmNode: tree: Dict[str, Any] parent_id: int position: int @@ -119,9 +266,9 @@ class Node: ) # create stack - node_stack: List[Node] = [ - Node(sub_tree["left_child"], parent_id, 0), - Node(sub_tree["right_child"], parent_id, 1), + node_stack: List[LightGbmNode] = [ + LightGbmNode(sub_tree["left_child"], parent_id, 0), + LightGbmNode(sub_tree["right_child"], parent_id, 1), ] # dfs through it @@ -158,169 +305,13 @@ class Node: ) # append children - node_stack.append(Node(sub_tree["left_child"], parent_id, 0)) - node_stack.append(Node(sub_tree["right_child"], parent_id, 1)) + node_stack.append(LightGbmNode(sub_tree["left_child"], parent_id, 0)) + node_stack.append(LightGbmNode(sub_tree["right_child"], parent_id, 1)) return mb.model() def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - class NodeList(list): - """Helper class that is able to extract all information required by the - model builders from an XGBooster.Booster object""" - - @dataclass - class Node: - tree_id: int - node_id: int - left_child_id: Optional[int] - right_child_id: Optional[int] - cover: float - is_leaf: bool - default_left: bool - feature: Optional[int] - value: Optional[float] - parent_id: Optional[int] = -1 - position: Optional[int] = -1 - - def get_value_closest_float_downward(self) -> np.float64: - """Get the closest exact fp value smaller than self.value""" - return np.nextafter(np.single(self.value), np.single(-np.inf)) - - class TreeView: - def __init__(self, tree_id: int, nodes: "list[NodeList.Node]") -> None: - self.tree_id = tree_id - self.nodes = nodes - self.n_nodes = len(nodes) - - @property - def is_leaf(self) -> bool: - return len(self.nodes) == 1 and self.nodes[0].is_leaf - - @property - def value(self) -> float: - if not self.is_leaf: - raise ValueError("Tree is not a leaf-only tree") - if not self.nodes[0].value: - raise ValueError("Tree is leaf-only but leaf node has no value") - return self.nodes[0].value - - def get_children( - self, node: "NodeList.Node" - ) -> 
"tuple[NodeList.Node, NodeList.Node]": - """Find children of the provided node""" - children_ids = (node.left_child_id, node.right_child_id) - selection = [n for n in self.nodes if n.node_id in children_ids] - assert ( - len(selection) == 2 - ), f"Found {len(selection)} (!= 2) child nodes for node {node}" - return tuple(selection) - - @staticmethod - def from_booster(booster: xgb.Booster) -> "NodeList": - """Create a TreeList object from a xgb.Booster object""" - tl = NodeList() - df = booster.trees_to_dataframe() - for _, node in df.iterrows(): - tree_id, node_id = map(int, node["ID"].split("-")) # e.g. 0-1 - is_leaf = node["Feature"] == "Leaf" - left_child_id = ( - int(node["Yes"].split("-")[1]) - if isinstance(node["Yes"], str) - else None - ) - right_child_id = ( - int(node["No"].split("-")[1]) if isinstance(node["No"], str) else None - ) - assert ( - left_child_id is None or left_child_id != right_child_id - ), "Children ID mismatch" - tl.append( - NodeList.Node( - tree_id=tree_id, - node_id=node_id, - left_child_id=left_child_id, - right_child_id=right_child_id, - cover=node["Cover"], - feature=int(node["Feature"]) - if node["Feature"].isnumeric() - else None, - is_leaf=is_leaf, - default_left=node["Yes"] == node["Missing"], - value=None if is_leaf else node["Split"], - ) - ) - - # fill the missing leaf values which are not part of the dataframe - tl._fill_leaf_values(booster.get_dump(dump_format="json")) - - return tl - - def iter_trees(self) -> "Generator[NodeList.TreeView, None, None]": - """Iterate over TreeViews""" - tree_ids = set((node.tree_id for node in self)) - for tid in tree_ids: - yield NodeList.TreeView( - tree_id=tid, nodes=[n for n in self if n.tree_id == tid] - ) - - def _fill_leaf_values(self, booster_dump: list[str]) -> None: - """Fill the leaf values (i.e. the predictions) from `booster_dump` - Note: These values are not contained in the pd.DataFrame format""" - - def get_leaf_nodes( - node: Dict[str, Any], leaf_nodes: list[Dict[str, Any]] = [] - ) -> None: - """Helper to get all leaf nodes from the json.loads() of the booster_dump""" - if "children" in node: - get_leaf_nodes(node["children"][0], leaf_nodes) - get_leaf_nodes(node["children"][1], leaf_nodes) - return - - if "leaf" not in node: - raise KeyError(f"Node does not have a 'leaf' value: {node}") - - leaf_nodes.append(node) - - root_nodes = [json.loads(s) for s in booster_dump] - - for tree_id, root_node in enumerate(root_nodes): - # - leaf_nodes = [] - get_leaf_nodes(root_node, leaf_nodes) - - for node in self: - if not node.is_leaf: - continue - - if node.tree_id != tree_id: - continue - - try: - node.value = float( - [ - l["leaf"] - for l in leaf_nodes - if l["nodeid"] == node.node_id - ].pop() - ) - except IndexError as e: - raise ValueError( - f"No leaf information for node {node.node_id} in tree {node.tree_id}" - ) from e - - # assert all tree leafs have a value - for node in self: - if node.is_leaf: - assert ( - node.value is not None - ), f"Failed to find leaf value for node {node}" - - def __setitem__(self): - raise NotImplementedError( - "Use TreeList.from_booster() to initialize a TreeList" - ) - # Release Note for XGBoost 1.5.0: Python interface now supports configuring # constraints using feature names instead of feature indices. This also # helps with pandas input with set feature names. 
@@ -391,7 +382,7 @@ def __setitem__(self):
     )
 
     # create queue
-    node_queue: Deque[NodeList.Node] = deque()
+    node_queue: Deque[Node] = deque()
     children = tree.get_children(root_node)
     for position, child in enumerate(children):
         child.parent_id = parent_id
@@ -512,7 +503,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any:
         else:
 
             @dataclass
-            class Node:
+            class CatBoostNode:
                 split: Optional[float] = None
                 value: Optional[list[float]] = None
                 right: Optional[int] = None
@@ -524,7 +515,7 @@ class Node:
                 # Get number of trees and splits info via BFS
                 # Create queue
                 nodes_queue = []
-                root_node = Node(
+                root_node = CatBoostNode(
                     split=splits[model_data["trees"][tree_num]["split"]["split_index"]]
                 )
                 nodes_queue.append((model_data["trees"][tree_num], root_node))
@@ -537,15 +528,15 @@ class Node:
                             cur_node.value = [cur_node_data["value"] * scale + bias]
                     else:
                         cur_node.split = splits[cur_node_data["split"]["split_index"]]
-                        left_node = Node()
-                        right_node = Node()
+                        left_node = CatBoostNode()
+                        right_node = CatBoostNode()
                         cur_node.left = left_node
                         cur_node.right = right_node
                         nodes_queue.append((cur_node_data["left"], left_node))
                         nodes_queue.append((cur_node_data["right"], right_node))
                         n_nodes += 2
             else:
-                root_node = Node()
+                root_node = CatBoostNode()
                 if is_classification and n_classes > 2:
                     root_node.value = [
                         value * scale for value in model_data["trees"][tree_num]["value"]

From 9b7255398ff4b26463c2aeefac435808f297f64e Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Fri, 28 Jul 2023 04:14:17 -0700
Subject: [PATCH 06/64] add cover to gbt regression nodes

---
 generator/gen_daal4py.py                      |  2 +-
 src/{gbt_convertors.py => gbt_convertors.pyx} | 86 +++++++++++++------
 src/gbt_model_builder.h                       | 10 +--
 src/gbt_model_builder.pyx                     | 22 +++--
 4 files changed, 80 insertions(+), 40 deletions(-)
 rename src/{gbt_convertors.py => gbt_convertors.pyx} (93%)

diff --git a/generator/gen_daal4py.py b/generator/gen_daal4py.py
index 404d859e8a..c808aa25c8 100755
--- a/generator/gen_daal4py.py
+++ b/generator/gen_daal4py.py
@@ -1235,7 +1235,7 @@ def gen_daal4py(dalroot, outdir, version, warn_all=False, no_dist=False, no_stre
     ):
         with open(jp("src", "gbt_model_builder.pyx"), "r") as f:
             pyx_gbt_model_builder = f.read()
-        with open(jp("src", "gbt_convertors.py"), "r") as f:
+        with open(jp("src", "gbt_convertors.pyx"), "r") as f:
             pyx_gbt_generators = f.read()
         if (
             "algorithms::logistic_regression" in iface.namespace_dict
diff --git a/src/gbt_convertors.py b/src/gbt_convertors.pyx
similarity index 93%
rename from src/gbt_convertors.py
rename to src/gbt_convertors.pyx
index e24ae26f42..bb58290f2a 100755
--- a/src/gbt_convertors.py
+++ b/src/gbt_convertors.pyx
@@ -16,18 +16,35 @@
 import json
 from collections import deque
-from dataclasses import dataclass
 from os import getpid, remove
 from time import time
-from typing import TYPE_CHECKING, Any, Deque, Dict, Generator, List, Optional
+from typing import Any, Deque, Dict, Generator, List, Optional, Tuple
 
 import numpy as np
+import xgboost as xgb
 
-if TYPE_CHECKING:
-    import xgboost as xgb
+
+class CatBoostNode:
+    def __init__(
+        self,
+        split: Optional[float] = None,
+        value: Optional[List[float]] = None,
+        right: Optional[int] = None,
+        left: Optional[float] = None,
+    ) -> None:
+        self.split = split
+        self.value = value
+        self.right = right
+        self.left = left
+
+
+class LightGbmNode:
+    def __init__(self, tree: Dict[str, Any], parent_id: int, position: int) -> None:
+        self.tree = tree
+        self.parent_id = parent_id
+        self.position = position
 
-@dataclass
 class Node:
"""Helper class holding Tree Node information""" @@ -43,6 +60,32 @@ class Node: parent_id: Optional[int] = -1 position: Optional[int] = -1 + def __init__( + self, + tree_id: int, + node_id: int, + left_child_id: Optional[int], + right_child_id: Optional[int], + cover: float, + is_leaf: bool, + default_left: bool, + feature: Optional[int], + value: Optional[float], + parent_id: Optional[int] = -1, + position: Optional[int] = -1, + ) -> None: + self.tree_id = tree_id + self.node_id = node_id + self.left_child_id = left_child_id + self.right_child_id = right_child_id + self.cover = cover + self.is_leaf = is_leaf + self.default_left = default_left + self.feature = feature + self.value = value + self.parent_id = parent_id + self.position = position + def get_value_closest_float_downward(self) -> np.float64: """Get the closest exact fp value smaller than self.value""" return np.nextafter(np.single(self.value), np.single(-np.inf)) @@ -51,7 +94,7 @@ def get_value_closest_float_downward(self) -> np.float64: class TreeView: """Helper class, treating a list of nodes as one tree""" - def __init__(self, tree_id: int, nodes: list[Node]) -> None: + def __init__(self, tree_id: int, nodes: List[Node]) -> None: self.tree_id = tree_id self.nodes = nodes self.n_nodes = len(nodes) @@ -68,7 +111,7 @@ def value(self) -> float: raise ValueError("Tree is leaf-only but leaf node has no value") return self.nodes[0].value - def get_children(self, node: Node) -> tuple[Node, Node]: + def get_children(self, node: Node) -> Tuple[Node, Node]: """Find children of the provided node""" children_ids = (node.left_child_id, node.right_child_id) selection = [n for n in self.nodes if n.node_id in children_ids] @@ -121,12 +164,12 @@ def iter_trees(self) -> Generator[TreeView, None, None]: for tid in tree_ids: yield TreeView(tree_id=tid, nodes=[n for n in self if n.tree_id == tid]) - def _fill_leaf_values(self, booster_dump: list[str]) -> None: + def _fill_leaf_values(self, booster_dump: List[str]) -> None: """Fill the leaf values (i.e. 
the predictions) from `booster_dump` Note: These values are not contained in the pd.DataFrame format""" def get_leaf_nodes( - node: Dict[str, Any], leaf_nodes: list[Dict[str, Any]] = [] + node: Dict[str, Any], leaf_nodes: List[Dict[str, Any]] = [] ) -> None: """Helper to get all leaf nodes from the json.loads() of the booster_dump""" if "children" in node: @@ -198,12 +241,6 @@ def get_catboost_params(booster): def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: - @dataclass - class LightGbmNode: - tree: Dict[str, Any] - parent_id: int - position: int - if lgb_model is None: lgb_model = get_lightgbm_params(model) @@ -347,8 +384,9 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: if is_regression: mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) + # TODO: Understand why this tree is added tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score) + mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) else: mb = gbt_clf_model_builder( n_features=n_features, n_iterations=n_iterations, n_classes=n_classes @@ -367,22 +405,24 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: class_label += 1 if tree.is_leaf: - mb.add_leaf(tree_id=tree_id, response=tree.value) + mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) continue root_node = tree.nodes[0] assert isinstance( root_node.feature, int ), f"Feature names must be integers (got ({type(root_node.feature)}){root_node.feature})" + parent_id = mb.add_split( tree_id=tree_id, feature_index=root_node.feature, feature_value=root_node.get_value_closest_float_downward(), + cover=root_node.cover, default_left=root_node.default_left, ) # create queue - node_queue: Deque[Node] = deque() + node_queue: Deque[NodeList.Node] = deque() children = tree.get_children(root_node) for position, child in enumerate(children): child.parent_id = parent_id @@ -398,6 +438,7 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: mb.add_leaf( tree_id=tree_id, response=node.value, + cover=node.cover, parent_id=node.parent_id, position=node.position, ) @@ -409,6 +450,7 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: tree_id=tree_id, feature_index=node.feature, feature_value=node.get_value_closest_float_downward(), + cover=node.cover, default_left=node.default_left, parent_id=node.parent_id, position=node.position, @@ -501,14 +543,6 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: (model_data["oblivious_trees"][tree_num], cur_tree_depth) ) else: - - @dataclass - class CatBoostNode: - split: Optional[float] = None - value: Optional[list[float]] = None - right: Optional[int] = None - left: Optional[int] = None - n_nodes = 1 # Check if node is a leaf (in case of stump) if "split" in model_data["trees"][tree_num]: diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index c906a0a537..0f7335da65 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -23,9 +23,9 @@ #include "onedal/version.hpp" #if (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) -#define _gbt_inference_has_missing_values_support 1 + #define _gbt_inference_has_missing_values_support 1 #else -#define _gbt_inference_has_missing_values_support 0 + #define _gbt_inference_has_missing_values_support 0 #endif typedef daal::algorithms::gbt::classification::ModelBuilder c_gbt_classification_model_builder; @@ -58,12 +58,12 @@ c_gbt_clf_node_id 
clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ #endif } -c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) +c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) { #if _gbt_inference_has_missing_values_support - return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover, defaultLeft); #else - return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue); + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover); #endif } diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index f46264ed94..0ea68a44f4 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -38,10 +38,10 @@ cdef extern from "gbt_model_builder.h": cdef cppclass c_gbt_regression_model_builder: c_gbt_regression_model_builder(size_t nFeatures, size_t nIterations) except + c_gbt_reg_tree_id createTree(size_t nNodes) - c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response) + c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) - cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) + cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) cdef class gbt_classification_model_builder: ''' @@ -65,7 +65,7 @@ cdef class gbt_classification_model_builder: ''' return self.c_ptr.createTree(n_nodes, class_label) - def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): + def add_leaf(self, c_gbt_clf_tree_id tree_id, double response, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): ''' Create Leaf node and add it to certain tree @@ -73,11 +73,13 @@ cdef class gbt_classification_model_builder: :param node-handle parent_id: parent node to which new node is added (use noParent for root node) :param size_t position: position in parent (e.g. 
0 for left and 1 for right child in a binary tree)
         :param double response: response value for leaf node to be predicted
+        :param double cover: cover (sum_hess) of the leaf node
         :rtype: node identifier
         '''
+        # TODO: Forward cover to oneDAL
         return self.c_ptr.addLeafNode(tree_id, parent_id, position, response)
 
-    def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
+    def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, double cover, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
         '''
         Create Split node and add it to certain tree.
 
@@ -86,9 +88,11 @@
         :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
         :param size_t feature_index: feature index for splitting
         :param double feature_value: feature value for splitting
+        :param double cover: cover (sum_hess) of the split node
         :param int default_left: default behaviour in case of missing value
         :rtype: node identifier
         '''
+        # TODO: Forward cover to oneDAL
         return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left)
 
     def model(self):
@@ -123,7 +127,7 @@ cdef class gbt_regression_model_builder:
         '''
         return self.c_ptr.createTree(n_nodes)
 
-    def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
+    def add_leaf(self, c_gbt_reg_tree_id tree_id, double response, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
         '''
         Create Leaf node and add it to certain tree
 
         :param tree-handle tree_id: tree to which new node is added
         :param node-handle parent_id: parent node to which new node is added (use noParent for root node)
         :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
         :param double response: response value for leaf node to be predicted
+        :param double cover: cover (sum_hess) of the leaf node
         :rtype: node identifier
         '''
-        return self.c_ptr.addLeafNode(tree_id, parent_id, position, response)
+        return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover)
 
-    def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
+    def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, double cover, int default_left, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
        '''
        Create Split node and add it to certain tree.
 
@@ -144,10 +149,11 @@
        :param size_t position: position in parent (e.g.
0 for left and 1 for right child in a binary tree) :param size_t feature_index: feature index for spliting :param double feature_value: feature value for spliting + :param double cover: cover (sum_hess) of the split node :param int default_left: default behaviour in case of missing value :rtype: node identifier ''' - return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left) + return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, cover, default_left) def model(self): ''' From e6e198413fa597bda3e5c59b8063d65b01a8a684 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 1 Aug 2023 08:41:33 -0700 Subject: [PATCH 07/64] simplify xgboost tree parser --- src/gbt_convertors.pyx | 145 ++++++++++++----------------------------- 1 file changed, 43 insertions(+), 102 deletions(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index bb58290f2a..c4b7e51455 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -48,18 +48,6 @@ class LightGbmNode: class Node: """Helper class holding Tree Node information""" - tree_id: int - node_id: int - left_child_id: Optional[int] - right_child_id: Optional[int] - cover: float - is_leaf: bool - default_left: bool - feature: Optional[int] - value: Optional[float] - parent_id: Optional[int] = -1 - position: Optional[int] = -1 - def __init__( self, tree_id: int, @@ -69,8 +57,8 @@ class Node: cover: float, is_leaf: bool, default_left: bool, - feature: Optional[int], - value: Optional[float], + feature: int, + value: float, parent_id: Optional[int] = -1, position: Optional[int] = -1, ) -> None: @@ -81,15 +69,23 @@ class Node: self.cover = cover self.is_leaf = is_leaf self.default_left = default_left - self.feature = feature - self.value = value self.parent_id = parent_id self.position = position + self.value = value + self.__feature = feature def get_value_closest_float_downward(self) -> np.float64: """Get the closest exact fp value smaller than self.value""" return np.nextafter(np.single(self.value), np.single(-np.inf)) + @property + def feature(self) -> int: + if not (isinstance(self.__feature, str) and self.__feature.isnumeric()): + raise ValueError( + f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})" + ) + return int(self.__feature) + class TreeView: """Helper class, treating a list of nodes as one tree""" @@ -127,36 +123,37 @@ class NodeList(list): @staticmethod def from_booster(booster: xgb.Booster) -> "NodeList": - """Create a TreeList object from a xgb.Booster object""" - tl = NodeList() - df = booster.trees_to_dataframe() - for _, node in df.iterrows(): - tree_id, node_id = map(int, node["ID"].split("-")) # e.g. 
0-1 - is_leaf = node["Feature"] == "Leaf" - left_child_id = ( - int(node["Yes"].split("-")[1]) if isinstance(node["Yes"], str) else None - ) - right_child_id = ( - int(node["No"].split("-")[1]) if isinstance(node["No"], str) else None - ) - tl.append( - Node( - tree_id=tree_id, - node_id=node_id, - left_child_id=left_child_id, - right_child_id=right_child_id, - cover=node["Cover"], - feature=int(node["Feature"]) if node["Feature"].isnumeric() else None, - is_leaf=is_leaf, - default_left=node["Yes"] == node["Missing"], - value=None if is_leaf else node["Split"], + nl = NodeList() + dump = booster.get_dump(dump_format="json", with_stats=True) + for tree_id, raw_tree in enumerate(dump): + nodes = deque() + nodes.append(json.loads(raw_tree)) + while nodes: + node = nodes.popleft() + if "children" in node: + left_child_id = node["children"][0]["nodeid"] + right_child_id = node["children"][1]["nodeid"] + nodes.append(node["children"][0]) + nodes.append(node["children"][1]) + else: + left_child_id, right_child_id = None, None + is_leaf = "leaf" in node + default_left = "yes" in node and node["yes"] == node["missing"] + nl.append( + Node( + tree_id=tree_id, + node_id=node["nodeid"], + left_child_id=left_child_id, + right_child_id=right_child_id, + cover=node["cover"], + feature=node.get("split"), + is_leaf=is_leaf, + default_left=default_left, + value=node["leaf"] if is_leaf else node["split_condition"], + ) ) - ) - - # fill the missing leaf values which are not part of the dataframe - tl._fill_leaf_values(booster.get_dump(dump_format="json")) - return tl + return nl def iter_trees(self) -> Generator[TreeView, None, None]: """Iterate over TreeViews""" @@ -164,55 +161,6 @@ class NodeList(list): for tid in tree_ids: yield TreeView(tree_id=tid, nodes=[n for n in self if n.tree_id == tid]) - def _fill_leaf_values(self, booster_dump: List[str]) -> None: - """Fill the leaf values (i.e. 
the predictions) from `booster_dump` - Note: These values are not contained in the pd.DataFrame format""" - - def get_leaf_nodes( - node: Dict[str, Any], leaf_nodes: List[Dict[str, Any]] = [] - ) -> None: - """Helper to get all leaf nodes from the json.loads() of the booster_dump""" - if "children" in node: - get_leaf_nodes(node["children"][0], leaf_nodes) - get_leaf_nodes(node["children"][1], leaf_nodes) - return - - if "leaf" not in node: - raise KeyError(f"Node does not have a 'leaf' value: {node}") - - leaf_nodes.append(node) - - root_nodes = [json.loads(s) for s in booster_dump] - - for tree_id, root_node in enumerate(root_nodes): - leaf_nodes = [] - get_leaf_nodes(root_node, leaf_nodes) - - for node in self: - if not node.is_leaf: - continue - - if node.tree_id != tree_id: - continue - - try: - node.value = float( - [ - l["leaf"] for l in leaf_nodes if l["nodeid"] == node.node_id - ].pop() - ) - except IndexError as e: - raise ValueError( - f"No leaf information for node {node.node_id} in tree {node.tree_id}" - ) from e - - # assert all tree leafs have a value - for node in self: - if node.is_leaf: - assert ( - node.value is not None - ), f"Failed to find leaf value for node {node}" - def __setitem__(self): raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList") @@ -380,11 +328,11 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: n_iterations = booster.best_iteration + 1 - # Create + base iteration + # Create if is_regression: mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - # TODO: Understand why this tree is added + # add base score as the first tree tree_id = mb.create_tree(1) mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) else: @@ -409,10 +357,6 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: continue root_node = tree.nodes[0] - assert isinstance( - root_node.feature, int - ), f"Feature names must be integers (got ({type(root_node.feature)}){root_node.feature})" - parent_id = mb.add_split( tree_id=tree_id, feature_index=root_node.feature, @@ -443,9 +387,6 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: position=node.position, ) else: - assert isinstance( - node.feature, int - ), f"Feature names must be integers (got ({type(node.feature)}){node.feature})" parent_id = mb.add_split( tree_id=tree_id, feature_index=node.feature, From 87dc4ca073eadb8c2db9535824f256e6620d4e70 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 9 Aug 2023 10:39:32 -0700 Subject: [PATCH 08/64] Refactor gbt model parser for speed and add tests --- src/gbt_convertors-recursive-append.pyx | 755 ++++++++++++++++++++++++ src/gbt_convertors.pyx | 131 ++-- tests/test_xgboost_mb.py | 173 ++++-- 3 files changed, 948 insertions(+), 111 deletions(-) create mode 100644 src/gbt_convertors-recursive-append.pyx diff --git a/src/gbt_convertors-recursive-append.pyx b/src/gbt_convertors-recursive-append.pyx new file mode 100644 index 0000000000..95ea375348 --- /dev/null +++ b/src/gbt_convertors-recursive-append.pyx @@ -0,0 +1,755 @@ +# =============================================================================== +# Copyright 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import json +from collections import deque +from os import getpid, remove +from time import time +from typing import Any, Deque, Dict, Generator, List, Optional, Tuple + +import numpy as np +import xgboost as xgb + + +class CatBoostNode: + def __init__( + self, + split: Optional[float] = None, + value: Optional[List[float]] = None, + right: Optional[int] = None, + left: Optional[float] = None, + ) -> None: + self.split = split + self.value = value + self.right = right + self.left = left + + +class LightGbmNode: + def __init__(self, tree: Dict[str, Any], parent_id: int, position: int) -> None: + self.tree = tree + self.parent_id = parent_id + self.position = position + + +class Node: + """Helper class holding Tree Node information""" + + def __init__( + self, + node_id: int, + cover: float, + is_leaf: bool, + default_left: bool, + feature: int, + value: float, + n_children: int = 0, + left_child: "Optional[Node]" = None, + right_child: "Optional[Node]" = None, + parent_id: Optional[int] = -1, + position: Optional[int] = -1, + ) -> None: + self.node_id = node_id + self.cover = cover + self.is_leaf = is_leaf + self.default_left = default_left + self.__feature = feature + self.value = value + self.n_children = n_children + self.left_child = left_child + self.right_child = right_child + self.parent_id = parent_id + self.position = position + + @staticmethod + def from_dict(input_dict: Dict[str, Any]) -> "Node": + if "children" in input_dict: + left_child = Node.from_dict(input_dict["children"][0]) + right_child = Node.from_dict(input_dict["children"][1]) + n_children = 2 + left_child.n_children + right_child.n_children + else: + left_child = None + right_child = None + n_children = 0 + is_leaf = "leaf" in input_dict + default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] + return Node( + node_id=input_dict["nodeid"], + cover=input_dict["cover"], + is_leaf=is_leaf, + default_left=default_left, + feature=input_dict.get("split"), + value=input_dict["leaf"] if is_leaf else input_dict["split_condition"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + + def get_value_closest_float_downward(self) -> np.float64: + """Get the closest exact fp value smaller than self.value""" + return np.nextafter(np.single(self.value), np.single(-np.inf)) + + def get_children(self) -> "Optional[Tuple[Node, Node]]": + if not self.left_child or not self.right_child: + assert self.is_leaf + else: + return (self.left_child, self.right_child) + + @property + def feature(self) -> int: + if not (isinstance(self.__feature, str) and self.__feature.isnumeric()): + raise ValueError( + f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})" + ) + return int(self.__feature) + + +class TreeView: + """Helper class, treating a list of nodes as one tree""" + + def __init__(self, tree_id: int, root_node: Node) -> None: + self.tree_id = tree_id + self.root_node = root_node + + @property + def is_leaf(self) -> bool: + return self.root_node.is_leaf + + 
@property + def value(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + if not self.root_node.value: + raise ValueError("Tree is leaf-only but leaf node has no value") + return self.root_node.value + + @property + def cover(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + return self.root_node.cover + + @property + def n_nodes(self) -> int: + return self.root_node.n_children + 1 + + +class TreeList: + """Helper class that is able to extract all information required by the + model builders from an XGBoost.Booster object""" + + def __init__(self): + self.tree_views: List[TreeView] = [] + + @staticmethod + def from_booster(booster: xgb.Booster) -> "TreeList": + tl = TreeList() + dump = booster.get_dump(dump_format="json", with_stats=True) + for tree_id, raw_tree in enumerate(dump): + raw_tree_parsed = json.loads(raw_tree) + root_node = Node.from_dict(raw_tree_parsed) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) + + return tl + + def append(self, elem): + self.tree_views.append(elem) + + def __iter__(self) -> Generator[TreeView, None, None]: + """Iterate over TreeViews""" + for tree_view in self.tree_views: + yield tree_view + + def __setitem__(self): + raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList") + + +def get_lightgbm_params(booster): + return booster.dump_model() + + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + + +def get_catboost_params(booster): + dump_filename = f"catboost_model_{getpid()}_{time()}" + + # Dump model in file + booster.save_model(dump_filename, "json") + + # Read json with model + with open(dump_filename) as file: + model_data = json.load(file) + + # Delete dump file + remove(dump_filename) + return model_data + + +def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: + if lgb_model is None: + lgb_model = get_lightgbm_params(model) + + n_features = lgb_model["max_feature_idx"] + 1 + n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] + n_classes = lgb_model["num_tree_per_iteration"] + + is_regression = False + objective_fun = lgb_model["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass classification" + ) + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + iterations_counter = 0 + for tree in lgb_model["tree_info"]: + if is_regression: + tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1) + else: + tree_id = mb.create_tree( + n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label + ) + + iterations_counter += 1 + if iterations_counter == n_iterations: + iterations_counter = 0 + class_label += 1 + sub_tree = tree["tree_structure"] + + # root is leaf + if "leaf_value" in sub_tree: + mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) + continue + + # add root + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=sub_tree["split_feature"], 
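+            # NOTE: feat_val is guaranteed numeric at this point; string-valued
+            # thresholds (categorical splits) were rejected above with NotImplementedError.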
+ feature_value=feat_val, + default_left=default_left, + ) + + # create stack + node_stack: List[LightGbmNode] = [ + LightGbmNode(sub_tree["left_child"], parent_id, 0), + LightGbmNode(sub_tree["right_child"], parent_id, 1), + ] + + # dfs through it + while node_stack: + sub_tree = node_stack[-1].tree + parent_id = node_stack[-1].parent_id + position = node_stack[-1].position + node_stack.pop() + + # current node is leaf + if "leaf_index" in sub_tree: + mb.add_leaf( + tree_id=tree_id, + response=sub_tree["leaf_value"], + parent_id=parent_id, + position=position, + ) + continue + + # current node is split + feat_val = sub_tree["threshold"] + if isinstance(feat_val, str): + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + default_left = int(sub_tree["default_left"]) + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=sub_tree["split_feature"], + feature_value=feat_val, + default_left=default_left, + parent_id=parent_id, + position=position, + ) + + # append children + node_stack.append(LightGbmNode(sub_tree["left_child"], parent_id, 0)) + node_stack.append(LightGbmNode(sub_tree["right_child"], parent_id, 1)) + + return mb.model() + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. + booster.feature_names = [str(i) for i in range(booster.num_features())] + + if xgb_config is None: + xgb_config = get_xgboost_params(booster) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) + elif objective_fun.find("binary:") == 0: + if objective_fun in ["binary:logistic", "binary:logitraw"]: + n_classes = 2 + else: + raise TypeError( + "binary:logistic and binary:logitraw are only supported for binary classification" + ) + else: + is_regression = True + + n_iterations = booster.best_iteration + 1 + + # Create + if is_regression: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) + + # add base score as the first tree + tree_id = mb.create_tree(1) + mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + node_list = TreeList.from_booster(booster) + for counter, tree in enumerate(node_list, start=1): + # find out the number of nodes in the tree + if is_regression: + tree_id = mb.create_tree(tree.n_nodes) + else: + tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) + + if counter % n_iterations == 0: + class_label += 1 + + if tree.is_leaf: + mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) + continue + + # def append(node: Node): + # if node.is_leaf: + # assert node.parent_id != -1, "node.parent_id must not be -1" + # assert node.position != -1, "node.position must not be -1" + + # mb.add_leaf( + # tree_id=tree_id, + # 
response=node.value,
+        #             cover=node.cover,
+        #             parent_id=node.parent_id,
+        #             position=node.position,
+        #         )
+
+        #     else:
+        #         assert node.left_child, "Split node must have left child"
+        #         assert node.right_child, "Split node must have right child"
+
+        #         parent_id = mb.add_split(
+        #             tree_id=tree_id,
+        #             feature_index=node.feature,
+        #             feature_value=node.get_value_closest_float_downward(),
+        #             cover=node.cover,
+        #             default_left=node.default_left,
+        #         )
+
+        #         node.left_child.parent_id = parent_id
+        #         node.left_child.position = 0
+        #         append(node.left_child)
+
+        #         node.right_child.parent_id = parent_id
+        #         node.right_child.position = 1
+        #         append(node.right_child)
+
+        # append(tree.root_node)
+
+        root_node = tree.root_node
+        parent_id = mb.add_split(
+            tree_id=tree_id,
+            feature_index=root_node.feature,
+            feature_value=root_node.get_value_closest_float_downward(),
+            cover=root_node.cover,
+            default_left=root_node.default_left,
+        )
+
+        # create queue
+        node_queue: Deque[Node] = deque()
+        children = root_node.get_children()
+        assert children is not None
+        for position, child in enumerate(children):
+            child.parent_id = parent_id
+            child.position = position
+            node_queue.append(child)
+
+        while node_queue:
+            node = node_queue.popleft()
+            assert node.parent_id != -1, "node.parent_id must not be -1"
+            assert node.position != -1, "node.position must not be -1"
+
+            if node.is_leaf:
+                mb.add_leaf(
+                    tree_id=tree_id,
+                    response=node.value,
+                    cover=node.cover,
+                    parent_id=node.parent_id,
+                    position=node.position,
+                )
+            else:
+                parent_id = mb.add_split(
+                    tree_id=tree_id,
+                    feature_index=node.feature,
+                    feature_value=node.get_value_closest_float_downward(),
+                    cover=node.cover,
+                    default_left=node.default_left,
+                    parent_id=node.parent_id,
+                    position=node.position,
+                )
+
+                children = node.get_children()
+                assert children is not None
+                for position, child in enumerate(children):
+                    child.parent_id = parent_id
+                    child.position = position
+                    node_queue.append(child)
+
+    return mb.model()
+
+
+def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any:
+    if not model.is_fitted():
+        raise RuntimeError("Model should be fitted before exporting to daal4py.")
+
+    if model_data is None:
+        model_data = get_catboost_params(model)
+
+    if "categorical_features" in model_data["features_info"]:
+        raise NotImplementedError(
+            "Categorical features are not supported in daal4py Gradient Boosting Trees"
+        )
+
+    n_features = len(model_data["features_info"]["float_features"])
+
+    is_symmetric_tree = (
+        model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"]
+        == "SymmetricTree"
+    )
+
+    if is_symmetric_tree:
+        n_iterations = len(model_data["oblivious_trees"])
+    else:
+        n_iterations = len(model_data["trees"])
+
+    n_classes = 0
+
+    if "class_params" in model_data["model_info"]:
+        is_classification = True
+        n_classes = len(model_data["model_info"]["class_params"]["class_to_label"])
+        mb = gbt_clf_model_builder(
+            n_features=n_features, n_iterations=n_iterations, n_classes=n_classes
+        )
+    else:
+        is_classification = False
+        mb = gbt_reg_model_builder(n_features, n_iterations)
+
+    splits = []
+
+    # Create splits array (all splits are placed sequentially)
+    for feature in model_data["features_info"]["float_features"]:
+        if feature["borders"]:
+            for feature_border in feature["borders"]:
+                splits.append(
+                    {"feature_index": feature["feature_index"], "value": feature_border}
+                )
+
+    if not is_classification:
+        bias = model_data["scale_and_bias"][1][0] / n_iterations
+        scale = model_data["scale_and_bias"][0]
+    else:
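+        # For classification, CatBoost leaf values are used as-is (identity scale, zero bias).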
+ bias = 0 + scale = 1 + + trees_explicit = [] + tree_symmetric = [] + + if ( + model_data["model_info"]["params"]["data_processing_options"][ + "float_features_binarization" + ]["nan_mode"] + == "Min" + ): + default_left = 1 + else: + default_left = 0 + + for tree_num in range(n_iterations): + if is_symmetric_tree: + if model_data["oblivious_trees"][tree_num]["splits"] is not None: + # Tree has more than 1 node + cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) + else: + cur_tree_depth = 0 + + tree_symmetric.append( + (model_data["oblivious_trees"][tree_num], cur_tree_depth) + ) + else: + n_nodes = 1 + # Check if node is a leaf (in case of stump) + if "split" in model_data["trees"][tree_num]: + # Get number of trees and splits info via BFS + # Create queue + nodes_queue = [] + root_node = CatBoostNode( + split=splits[model_data["trees"][tree_num]["split"]["split_index"]] + ) + nodes_queue.append((model_data["trees"][tree_num], root_node)) + while nodes_queue: + cur_node_data, cur_node = nodes_queue.pop(0) + if "value" in cur_node_data: + if isinstance(cur_node_data["value"], list): + cur_node.value = [value for value in cur_node_data["value"]] + else: + cur_node.value = [cur_node_data["value"] * scale + bias] + else: + cur_node.split = splits[cur_node_data["split"]["split_index"]] + left_node = CatBoostNode() + right_node = CatBoostNode() + cur_node.left = left_node + cur_node.right = right_node + nodes_queue.append((cur_node_data["left"], left_node)) + nodes_queue.append((cur_node_data["right"], right_node)) + n_nodes += 2 + else: + root_node = CatBoostNode() + if is_classification and n_classes > 2: + root_node.value = [ + value * scale for value in model_data["trees"][tree_num]["value"] + ] + else: + root_node.value = [ + model_data["trees"][tree_num]["value"] * scale + bias + ] + trees_explicit.append((root_node, n_nodes)) + + tree_id = [] + class_label = 0 + count = 0 + + # Only 1 tree for each iteration in case of regression or binary classification + if not is_classification or n_classes == 2: + n_tree_each_iter = 1 + else: + n_tree_each_iter = n_classes + + # Create id for trees (for the right order in modelbuilder) + for i in range(n_iterations): + for c in range(n_tree_each_iter): + if is_symmetric_tree: + n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 + else: + n_nodes = trees_explicit[i][1] + + if is_classification and n_classes > 2: + tree_id.append(mb.create_tree(n_nodes, class_label)) + count += 1 + if count == n_iterations: + class_label += 1 + count = 0 + + elif is_classification: + tree_id.append(mb.create_tree(n_nodes, 0)) + else: + tree_id.append(mb.create_tree(n_nodes)) + + if is_symmetric_tree: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + cur_tree_info = tree_symmetric[i][0] + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + cur_tree_leaf_val = cur_tree_info["leaf_values"] + cur_tree_depth = tree_symmetric[i][1] + + if cur_tree_depth == 0: + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + else: + # One split used for the whole level + cur_level_split = splits[ + cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] + ] + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + prev_level_nodes = [root_id] + + # Iterate over levels, splits in json are reversed (root split is the last) + for cur_level in range(cur_tree_depth - 2, -1, -1): + cur_level_nodes = [] + 
for cur_parent in prev_level_nodes: + cur_level_split = splits[ + cur_tree_info["splits"][cur_level]["split_index"] + ] + cur_left_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=0, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + cur_right_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=1, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + ) + cur_level_nodes.append(cur_left_node) + cur_level_nodes.append(cur_right_node) + prev_level_nodes = cur_level_nodes + + # Different storing format for leaves + if not is_classification or n_classes == 2: + for last_level_node_num in range(len(prev_level_nodes)): + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) + else: + for last_level_node_num in range(len(prev_level_nodes)): + left_index = ( + 2 * last_level_node_num * n_tree_each_iter + class_label + ) + right_index = ( + 2 * last_level_node_num + 1 + ) * n_tree_each_iter + class_label + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[left_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[right_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + ) + else: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + root_node = trees_explicit[i][0] + + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + # Traverse tree via BFS and build tree with modelbuilder + if root_node.value is None: + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=root_node.split["feature_index"], + feature_value=root_node.split["value"], + default_left=default_left, + ) + nodes_queue = [(root_node, root_id)] + while nodes_queue: + cur_node, cur_node_id = nodes_queue.pop(0) + left_node = cur_node.left + # Check if node is a leaf + if left_node.value is None: + left_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=0, + feature_index=left_node.split["feature_index"], + feature_value=left_node.split["value"], + default_left=default_left, + ) + nodes_queue.append((left_node, left_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=left_node.value[class_label], + parent_id=cur_node_id, + position=0, + ) + right_node = cur_node.right + # Check if node is a leaf + if right_node.value is None: + right_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=1, + feature_index=right_node.split["feature_index"], + feature_value=right_node.split["value"], + default_left=default_left, + ) + nodes_queue.append((right_node, right_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_node.right.value[class_label], + parent_id=cur_node_id, + position=1, + ) + + else: + # Tree has only one node + mb.add_leaf( + tree_id=cur_tree_id, response=root_node.value[class_label] + ) + + return mb.model() diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index c4b7e51455..8e7b768a4f 100755 --- a/src/gbt_convertors.pyx +++ 
b/src/gbt_convertors.pyx @@ -50,34 +50,64 @@ class Node: def __init__( self, - tree_id: int, node_id: int, - left_child_id: Optional[int], - right_child_id: Optional[int], cover: float, is_leaf: bool, default_left: bool, feature: int, value: float, + n_children: int = 0, + left_child: "Optional[Node]" = None, + right_child: "Optional[Node]" = None, parent_id: Optional[int] = -1, position: Optional[int] = -1, ) -> None: - self.tree_id = tree_id self.node_id = node_id - self.left_child_id = left_child_id - self.right_child_id = right_child_id self.cover = cover self.is_leaf = is_leaf self.default_left = default_left + self.__feature = feature + self.value = value + self.n_children = n_children + self.left_child = left_child + self.right_child = right_child self.parent_id = parent_id self.position = position - self.value = value - self.__feature = feature + + @staticmethod + def from_dict(input_dict: Dict[str, Any]) -> "Node": + if "children" in input_dict: + left_child = Node.from_dict(input_dict["children"][0]) + right_child = Node.from_dict(input_dict["children"][1]) + n_children = 2 + left_child.n_children + right_child.n_children + else: + left_child = None + right_child = None + n_children = 0 + is_leaf = "leaf" in input_dict + default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] + return Node( + node_id=input_dict["nodeid"], + cover=input_dict["cover"], + is_leaf=is_leaf, + default_left=default_left, + feature=input_dict.get("split"), + value=input_dict["leaf"] if is_leaf else input_dict["split_condition"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) def get_value_closest_float_downward(self) -> np.float64: """Get the closest exact fp value smaller than self.value""" return np.nextafter(np.single(self.value), np.single(-np.inf)) + def get_children(self) -> "Optional[Tuple[Node, Node]]": + if not self.left_child or not self.right_child: + assert self.is_leaf + else: + return (self.left_child, self.right_child) + @property def feature(self) -> int: if not (isinstance(self.__feature, str) and self.__feature.isnumeric()): @@ -90,76 +120,47 @@ class Node: class TreeView: """Helper class, treating a list of nodes as one tree""" - def __init__(self, tree_id: int, nodes: List[Node]) -> None: + def __init__(self, tree_id: int, root_node: Node) -> None: self.tree_id = tree_id - self.nodes = nodes - self.n_nodes = len(nodes) + self.root_node = root_node @property def is_leaf(self) -> bool: - return len(self.nodes) == 1 and self.nodes[0].is_leaf + return self.root_node.is_leaf @property def value(self) -> float: if not self.is_leaf: raise ValueError("Tree is not a leaf-only tree") - if not self.nodes[0].value: + if not self.root_node.value: raise ValueError("Tree is leaf-only but leaf node has no value") - return self.nodes[0].value + return self.root_node.value + + @property + def cover(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + return self.root_node.cover - def get_children(self, node: Node) -> Tuple[Node, Node]: - """Find children of the provided node""" - children_ids = (node.left_child_id, node.right_child_id) - selection = [n for n in self.nodes if n.node_id in children_ids] - assert ( - len(selection) == 2 - ), f"Found {len(selection)} (!= 2) child nodes for node {node}" - return tuple(selection) + @property + def n_nodes(self) -> int: + return self.root_node.n_children + 1 -class NodeList(list): +class TreeList(list): """Helper class that is able to extract all information 
required by the model builders from an XGBoost.Booster object""" @staticmethod - def from_booster(booster: xgb.Booster) -> "NodeList": - nl = NodeList() + def from_booster(booster: xgb.Booster) -> "TreeList": + tl = TreeList() dump = booster.get_dump(dump_format="json", with_stats=True) for tree_id, raw_tree in enumerate(dump): - nodes = deque() - nodes.append(json.loads(raw_tree)) - while nodes: - node = nodes.popleft() - if "children" in node: - left_child_id = node["children"][0]["nodeid"] - right_child_id = node["children"][1]["nodeid"] - nodes.append(node["children"][0]) - nodes.append(node["children"][1]) - else: - left_child_id, right_child_id = None, None - is_leaf = "leaf" in node - default_left = "yes" in node and node["yes"] == node["missing"] - nl.append( - Node( - tree_id=tree_id, - node_id=node["nodeid"], - left_child_id=left_child_id, - right_child_id=right_child_id, - cover=node["cover"], - feature=node.get("split"), - is_leaf=is_leaf, - default_left=default_left, - value=node["leaf"] if is_leaf else node["split_condition"], - ) - ) - - return nl + raw_tree_parsed = json.loads(raw_tree) + root_node = Node.from_dict(raw_tree_parsed) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) - def iter_trees(self) -> Generator[TreeView, None, None]: - """Iterate over TreeViews""" - tree_ids = set((node.tree_id for node in self)) - for tid in tree_ids: - yield TreeView(tree_id=tid, nodes=[n for n in self if n.tree_id == tid]) + return tl def __setitem__(self): raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList") @@ -341,8 +342,8 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: ) class_label = 0 - node_list = NodeList.from_booster(booster) - for counter, tree in enumerate(node_list.iter_trees(), start=1): + node_list = TreeList.from_booster(booster) + for counter, tree in enumerate(node_list, start=1): # find out the number of nodes in the tree if is_regression: tree_id = mb.create_tree(tree.n_nodes) @@ -356,7 +357,7 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) continue - root_node = tree.nodes[0] + root_node = tree.root_node parent_id = mb.add_split( tree_id=tree_id, feature_index=root_node.feature, @@ -366,8 +367,9 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: ) # create queue - node_queue: Deque[NodeList.Node] = deque() - children = tree.get_children(root_node) + node_queue: Deque[Node] = deque() + children = root_node.get_children() + assert children is not None for position, child in enumerate(children): child.parent_id = parent_id child.position = position @@ -397,7 +399,8 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: position=node.position, ) - children = tree.get_children(node) + children = node.get_children() + assert children is not None for position, child in enumerate(children): child.parent_id = parent_id child.position = position diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py index 60ab1b9bdd..8883935133 100644 --- a/tests/test_xgboost_mb.py +++ b/tests/test_xgboost_mb.py @@ -34,6 +34,68 @@ class XgboostModelBuilder(unittest.TestCase): + # @unittest.skipUnless( + # all( + # [ + # hasattr(d4p, "get_gbt_model_from_xgboost"), + # hasattr(d4p, "gbt_classification_prediction"), + # daal_check_version(((2021, "P", 1))), + # ] + # ), + # reason, + # ) + # @unittest.skipUnless( + # importlib.util.find_spec("xgboost") is not None, + # "xgboost library is 
not installed", + # ) + # def test_earlystop(self): + # import xgboost as xgb + + # num_classes = 3 + # X, y = make_classification( + # n_samples=1000, + # n_features=10, + # n_informative=3, + # n_classes=num_classes, + # random_state=42, + # ) + # X_train, X_test, y_train, y_test = train_test_split( + # X, y, test_size=0.3, random_state=42 + # ) + + # # training parameters setting + # params = { + # "n_estimators": 100, + # "max_bin": 256, + # "scale_pos_weight": 2, + # "lambda_l2": 1, + # "alpha": 0.9, + # "max_depth": 8, + # "num_leaves": 2**8, + # "verbosity": 0, + # "objective": "multi:softproba", + # "learning_rate": 0.3, + # "num_class": num_classes, + # "early_stopping_rounds": 5, + # } + + # xgb_clf = xgb.XGBClassifier(**params) + # xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) + # booster = xgb_clf.get_booster() + + # xgb_prediction = xgb_clf.predict(X_test) + # xgb_proba = xgb_clf.predict_proba(X_test) + # xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) + + # daal_model = d4p.mb.convert_model(booster) + + # daal_prediction = daal_model.predict(X_test) + # daal_proba = daal_model.predict_proba(X_test) + # daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) + + # self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) + # self.assertTrue(np.allclose(xgb_proba, daal_proba)) + @unittest.skipUnless( all( [ @@ -48,53 +110,70 @@ class XgboostModelBuilder(unittest.TestCase): importlib.util.find_spec("xgboost") is not None, "xgboost library is not installed", ) - def test_earlystop(self): - import xgboost as xgb - - num_classes = 3 - X, y = make_classification( - n_samples=1000, - n_features=10, - n_informative=3, - n_classes=num_classes, - random_state=42, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=42 - ) - - # training parameters setting - params = { - "n_estimators": 100, - "max_bin": 256, - "scale_pos_weight": 2, - "lambda_l2": 1, - "alpha": 0.9, - "max_depth": 8, - "num_leaves": 2**8, - "verbosity": 0, - "objective": "multi:softproba", - "learning_rate": 0.3, - "num_class": num_classes, - "early_stopping_rounds": 5, - } - - xgb_clf = xgb.XGBClassifier(**params) - xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) - booster = xgb_clf.get_booster() - - xgb_prediction = xgb_clf.predict(X_test) - xgb_proba = xgb_clf.predict_proba(X_test) - xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) - - daal_model = d4p.mb.convert_model(booster) - - daal_prediction = daal_model.predict(X_test) - daal_proba = daal_model.predict_proba(X_test) - daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) - - self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) - self.assertTrue(np.allclose(xgb_proba, daal_proba)) + def test_model_from_booster(self): + class MockBooster: + def get_dump(self, *_, **kwargs): + # raw dump of 2 trees with a max depth of 1 + return [ + ' { "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, "yes": 1, "no": 2, "missing": 1 , "gain": 3, "cover": 4, "children": [\n { "nodeid": 1, "leaf": 5 , "cover": 6 }, \n { "nodeid": 2, "leaf": 7 , "cover":8 }\n ]}', + ' { "nodeid": 0, "leaf": 0.2 , "cover": 42 }', + ] + + mock = MockBooster() + result = d4p.TreeList.from_booster(mock) + self.assertEqual(len(result), 2) + + tree0 = result[0] + self.assertIsInstance(tree0, d4p.TreeView) + self.assertFalse(tree0.is_leaf) + with self.assertRaises(ValueError): + tree0.cover + with 
self.assertRaises(ValueError): + tree0.value + + self.assertIsInstance(tree0.root_node, d4p.Node) + + self.assertEqual(tree0.root_node.node_id, 0) + self.assertEqual(tree0.root_node.left_child.node_id, 1) + self.assertEqual(tree0.root_node.right_child.node_id, 2) + + self.assertEqual(tree0.root_node.cover, 4) + self.assertEqual(tree0.root_node.left_child.cover, 6) + self.assertEqual(tree0.root_node.right_child.cover, 8) + + self.assertFalse(tree0.root_node.is_leaf) + self.assertTrue(tree0.root_node.left_child.is_leaf) + self.assertTrue(tree0.root_node.right_child.is_leaf) + + self.assertTrue(tree0.root_node.default_left) + self.assertFalse(tree0.root_node.left_child.default_left) + self.assertFalse(tree0.root_node.right_child.default_left) + + self.assertEqual(tree0.root_node.feature, 1) + with self.assertRaises(ValueError): + tree0.root_node.left_child.feature + with self.assertRaises(ValueError): + tree0.root_node.right_child.feature + + self.assertEqual(tree0.root_node.value, 2) + self.assertEqual(tree0.root_node.left_child.value, 5) + self.assertEqual(tree0.root_node.right_child.value, 7) + + self.assertEqual(tree0.root_node.n_children, 2) + self.assertEqual(tree0.root_node.left_child.n_children, 0) + self.assertEqual(tree0.root_node.right_child.n_children, 0) + + self.assertIsNone(tree0.root_node.left_child.left_child) + self.assertIsNone(tree0.root_node.left_child.right_child) + self.assertIsNone(tree0.root_node.right_child.left_child) + self.assertIsNone(tree0.root_node.right_child.right_child) + + tree1 = result[1] + self.assertIsInstance(tree1, d4p.TreeView) + self.assertTrue(tree1.is_leaf) + self.assertEqual(tree1.n_nodes, 1) + self.assertEqual(tree1.cover, 42) + self.assertEqual(tree1.value, 0.2) if __name__ == "__main__": From 0aaa508d75377834c8f37d3ccbfa6e0eebc3d73a Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 11 Aug 2023 08:10:34 -0700 Subject: [PATCH 09/64] feat: provide pred_contribs/pred_interactions kwargs in GBT _predict_regression --- daal4py/mb/model_builders.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index ce7f82b2e3..abc5bac769 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -200,7 +200,9 @@ def _predict_classification(self, X, fptype, resultsToEvaluate): else: return predict_result.probabilities - def _predict_regression(self, X, fptype): + def _predict_regression( + self, X, fptype, pred_contribs=False, pred_interactions=False + ): if X.shape[1] != self.n_features_in_: raise ValueError("Shape of input is different from what was seen in `fit`") @@ -213,7 +215,11 @@ def _predict_regression(self, X, fptype): ) # Prediction - predict_algo = d4p.gbt_regression_prediction(fptype=fptype) + predict_algo = d4p.gbt_regression_prediction( + fptype=fptype, + predShapContributions=pred_contribs, + predShapInteractions=pred_interactions, + ) predict_result = predict_algo.compute(X, self.daal_model_) return predict_result.prediction.ravel() @@ -223,11 +229,15 @@ class GBTDAALModel(GBTDAALBaseModel): def __init__(self): pass - def predict(self, X): + def predict(self, X, pred_contribs=False, pred_interactions=False): fptype = getFPType(X) if self._is_regression: - return self._predict_regression(X, fptype) + return self._predict_regression(X, fptype, pred_contribs, pred_interactions) else: + if pred_contribs or pred_interactions: + raise NotImplementedError( + f"{'pred_contribs' if pred_contribs else 'pred_interactions'} is not 
implemented for classification models" + ) return self._predict_classification(X, fptype, "computeClassLabels") def predict_proba(self, X): From caab07555a0f68eb4b6bba19a1e8401302847885 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 11 Aug 2023 08:48:14 -0700 Subject: [PATCH 10/64] re-enable mb tests --- tests/test_xgboost_mb.py | 122 +++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py index 8883935133..9d1adf5ef0 100644 --- a/tests/test_xgboost_mb.py +++ b/tests/test_xgboost_mb.py @@ -34,67 +34,67 @@ class XgboostModelBuilder(unittest.TestCase): - # @unittest.skipUnless( - # all( - # [ - # hasattr(d4p, "get_gbt_model_from_xgboost"), - # hasattr(d4p, "gbt_classification_prediction"), - # daal_check_version(((2021, "P", 1))), - # ] - # ), - # reason, - # ) - # @unittest.skipUnless( - # importlib.util.find_spec("xgboost") is not None, - # "xgboost library is not installed", - # ) - # def test_earlystop(self): - # import xgboost as xgb - - # num_classes = 3 - # X, y = make_classification( - # n_samples=1000, - # n_features=10, - # n_informative=3, - # n_classes=num_classes, - # random_state=42, - # ) - # X_train, X_test, y_train, y_test = train_test_split( - # X, y, test_size=0.3, random_state=42 - # ) - - # # training parameters setting - # params = { - # "n_estimators": 100, - # "max_bin": 256, - # "scale_pos_weight": 2, - # "lambda_l2": 1, - # "alpha": 0.9, - # "max_depth": 8, - # "num_leaves": 2**8, - # "verbosity": 0, - # "objective": "multi:softproba", - # "learning_rate": 0.3, - # "num_class": num_classes, - # "early_stopping_rounds": 5, - # } - - # xgb_clf = xgb.XGBClassifier(**params) - # xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) - # booster = xgb_clf.get_booster() - - # xgb_prediction = xgb_clf.predict(X_test) - # xgb_proba = xgb_clf.predict_proba(X_test) - # xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) - - # daal_model = d4p.mb.convert_model(booster) - - # daal_prediction = daal_model.predict(X_test) - # daal_proba = daal_model.predict_proba(X_test) - # daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) - - # self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) - # self.assertTrue(np.allclose(xgb_proba, daal_proba)) + @unittest.skipUnless( + all( + [ + hasattr(d4p, "get_gbt_model_from_xgboost"), + hasattr(d4p, "gbt_classification_prediction"), + daal_check_version(((2021, "P", 1))), + ] + ), + reason, + ) + @unittest.skipUnless( + importlib.util.find_spec("xgboost") is not None, + "xgboost library is not installed", + ) + def test_earlystop(self): + import xgboost as xgb + + num_classes = 3 + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_classes=num_classes, + random_state=42, + ) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.3, random_state=42 + ) + + # training parameters setting + params = { + "n_estimators": 100, + "max_bin": 256, + "scale_pos_weight": 2, + "lambda_l2": 1, + "alpha": 0.9, + "max_depth": 8, + "num_leaves": 2**8, + "verbosity": 0, + "objective": "multi:softproba", + "learning_rate": 0.3, + "num_class": num_classes, + "early_stopping_rounds": 5, + } + + xgb_clf = xgb.XGBClassifier(**params) + xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) + booster = xgb_clf.get_booster() + + xgb_prediction = xgb_clf.predict(X_test) + xgb_proba = xgb_clf.predict_proba(X_test) + xgb_errors_count = 
np.count_nonzero(xgb_prediction - np.ravel(y_test)) + + daal_model = d4p.mb.convert_model(booster) + + daal_prediction = daal_model.predict(X_test) + daal_proba = daal_model.predict_proba(X_test) + daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) + + self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) + self.assertTrue(np.allclose(xgb_proba, daal_proba)) @unittest.skipUnless( all( From b457e110ef64a755af8fe8c7b7999872b1a265c7 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 12 Sep 2023 02:50:05 -0700 Subject: [PATCH 11/64] Return pred_interactions in correct shape --- daal4py/mb/model_builders.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index abc5bac769..f949cff25f 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -222,7 +222,12 @@ def _predict_regression( ) predict_result = predict_algo.compute(X, self.daal_model_) - return predict_result.prediction.ravel() + if pred_interactions: + return predict_result.prediction.ravel().reshape( + (-1, X.shape[1] + 1, X.shape[1] + 1) + ) + else: + return predict_result.prediction.ravel() class GBTDAALModel(GBTDAALBaseModel): From b2edef4b93942f0dcbd1b0d73c535eebc29d8e13 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 27 Sep 2023 06:07:32 -0700 Subject: [PATCH 12/64] clean up inference APIs and versioning --- src/gbt_model_builder.h | 20 +++++++++++++------- src/gbt_model_builder.pyx | 10 ++++------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 0f7335da65..00e418417e 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -22,10 +22,12 @@ #include #include "onedal/version.hpp" -#if (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) - #define _gbt_inference_has_missing_values_support 1 +#if (((MAJOR_VERSION == 2024) && (MINOR_VERSION >= 1)) || (MAJOR_VERSION > 2024)) + #define _gbt_inference_api_version 2 +#elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) + #define _gbt_inference_api_version 1 #else - #define _gbt_inference_has_missing_values_support 0 + #define _gbt_inference_api_version 0 #endif typedef daal::algorithms::gbt::classification::ModelBuilder c_gbt_classification_model_builder; @@ -49,9 +51,11 @@ static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_bu return RAW()(obj_->getModel()); } -c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) +c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) { -#if _gbt_inference_has_missing_values_support +#if (_gbt_inference_api_version == 2) + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover, defaultLeft); +#elif (_gbt_inference_api_version == 1) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue); @@ -60,10 +64,12 @@ c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ c_gbt_reg_node_id 
regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) { -#if _gbt_inference_has_missing_values_support +#if (_gbt_inference_api_version == 2) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover, defaultLeft); +#elif (_gbt_inference_api_version == 1) + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else - return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover); + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue); #endif } diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index 0ea68a44f4..9f1fd4acb6 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -33,14 +33,14 @@ cdef extern from "gbt_model_builder.h": cdef cppclass c_gbt_classification_model_builder: c_gbt_classification_model_builder(size_t nFeatures, size_t nIterations, size_t nClasses) except + c_gbt_clf_tree_id createTree(size_t nNodes, size_t classLabel) - c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response) + c_gbt_clf_node_id addLeafNode(c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) cdef cppclass c_gbt_regression_model_builder: c_gbt_regression_model_builder(size_t nFeatures, size_t nIterations) except + c_gbt_reg_tree_id createTree(size_t nNodes) c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) - cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft) + cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) cdef class gbt_classification_model_builder: @@ -76,8 +76,7 @@ cdef class gbt_classification_model_builder: :param double cover: cover (sum_hess) of the leaf node :rtype: node identifier ''' - # TODO: Forward cover to oneDAL - return self.c_ptr.addLeafNode(tree_id, parent_id, position, response) + return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover) def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, double cover, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): ''' @@ -92,8 +91,7 @@ cdef class gbt_classification_model_builder: :param int default_left: default behaviour in case of missing value :rtype: node identifier ''' - # TODO: Forward cover to oneDAL - return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left) + return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, cover, default_left) def model(self): ''' From be077cb6d175f05ebf2f00e2c080fe49c478b529 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 27 Sep 
2023 06:08:37 -0700 Subject: [PATCH 13/64] Fix SHAP interaction output shape --- daal4py/mb/model_builders.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py index f949cff25f..a10b8f0d41 100644 --- a/daal4py/mb/model_builders.py +++ b/daal4py/mb/model_builders.py @@ -222,7 +222,9 @@ def _predict_regression( ) predict_result = predict_algo.compute(X, self.daal_model_) - if pred_interactions: + if pred_contribs: + return predict_result.prediction.ravel().reshape((-1, X.shape[1] + 1)) + elif pred_interactions: return predict_result.prediction.ravel().reshape( (-1, X.shape[1] + 1, X.shape[1] + 1) ) From bf28b084aaa35d86ea081f9d5278a10cdd2f83c5 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 27 Sep 2023 09:24:54 -0700 Subject: [PATCH 14/64] align tree clf/reg APIs --- src/gbt_model_builder.h | 8 ++++---- src/gbt_model_builder.pyx | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 00e418417e..b26636387b 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -51,10 +51,10 @@ static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_bu return RAW()(obj_->getModel()); } -c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) +c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { #if (_gbt_inference_api_version == 2) - return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover, defaultLeft); + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft, cover); #elif (_gbt_inference_api_version == 1) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else @@ -62,10 +62,10 @@ c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ #endif } -c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft) +c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { #if (_gbt_inference_api_version == 2) - return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, cover, defaultLeft); + return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft, cover); #elif (_gbt_inference_api_version == 1) return c_ptr->addSplitNode(treeId, parentId, position, featureIndex, featureValue, defaultLeft); #else diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index 9f1fd4acb6..e51b43fd32 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -40,8 +40,8 @@ cdef extern from "gbt_model_builder.h": c_gbt_reg_tree_id createTree(size_t nNodes) c_gbt_reg_node_id addLeafNode(c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) - cdef c_gbt_clf_node_id 
clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft)
-    cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, double cover, int defaultLeft)
+    cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)
+    cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover)

 cdef class gbt_classification_model_builder:
     '''
@@ -78,7 +78,7 @@ cdef class gbt_classification_model_builder:
         '''
         return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover)

-    def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, double cover, int default_left, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
+    def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0):
         '''
         Create Split node and add it to certain tree.

@@ -87,11 +87,11 @@
         :param size_t position: position in parent (e.g. 0 for left and 1 for right child in a binary tree)
         :param size_t feature_index: feature index for splitting
         :param double feature_value: feature value for splitting
-        :param double cover: cover (sum_hess) of the split node
         :param int default_left: default behaviour in case of missing value
+        :param double cover: cover (sum_hess) of the split node
         :rtype: node identifier
         '''
-        return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, cover, default_left)
+        return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover)

     def model(self):
         '''
@@ -138,7 +138,7 @@ cdef class gbt_regression_model_builder:
         '''
         return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover)

-    def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, double cover, int default_left, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
+    def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0):
         '''
         Create Split node and add it to certain tree.
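The reordered signatures are easiest to see in use. A minimal, hedged sketch, written as if inside the generated daal4py module where the gbt_reg_model_builder factory called by the convertors is in scope; keyword names follow the docstrings above, and this is an illustration, not a verified script:

    # Build one regression tree by hand: a single split on feature 0 at 0.5.
    mb = gbt_reg_model_builder(n_features=1, n_iterations=1)
    tree_id = mb.create_tree(3)  # 1 split node + 2 leaves
    root_id = mb.add_split(
        tree_id=tree_id,
        feature_index=0,
        feature_value=0.5,
        default_left=1,  # missing values go to the left child
        cover=10.0,      # sum_hess seen at the split
    )
    mb.add_leaf(tree_id=tree_id, response=-1.0, cover=6.0, parent_id=root_id, position=0)
    mb.add_leaf(tree_id=tree_id, response=1.0, cover=4.0, parent_id=root_id, position=1)
    model = mb.model()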
@@ -151,7 +151,7 @@ cdef class gbt_regression_model_builder:
         '''
         :param int default_left: default behaviour in case of missing value
         :rtype: node identifier
         '''
-        return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, cover, default_left)
+        return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover)

     def model(self):
         '''

From fdec26392682674109148588efdfa4358e887422 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 28 Sep 2023 07:02:37 -0700
Subject: [PATCH 15/64] update copyright

---
 src/gbt_convertors.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx
index 8e7b768a4f..a8284369f7 100755
--- a/src/gbt_convertors.pyx
+++ b/src/gbt_convertors.pyx
@@ -1,5 +1,5 @@
 # ===============================================================================
-# Copyright 2020 Intel Corporation
+# Copyright 2023 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 26333a4c9d0f5db669d6f49c81d50bf58032ea20 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Fri, 29 Sep 2023 02:03:22 -0700
Subject: [PATCH 16/64] fix: remove loading xgb only for a type hint

---
 src/gbt_convertors.pyx | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx
index a8284369f7..3588a74e66 100755
--- a/src/gbt_convertors.pyx
+++ b/src/gbt_convertors.pyx
@@ -21,7 +21,6 @@ from time import time
 from typing import Any, Deque, Dict, Generator, List, Optional, Tuple

 import numpy as np
-import xgboost as xgb


 class CatBoostNode:
@@ -152,7 +151,12 @@ class TreeList(list):
     model builders from an XGBoost.Booster object"""

     @staticmethod
-    def from_booster(booster: xgb.Booster) -> "TreeList":
+    def from_booster(booster) -> "TreeList":
+        """
+        Load a TreeList from an xgb.Booster object
+        Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code,
+        therefore no type hint is added.
+ """ tl = TreeList() dump = booster.get_dump(dump_format="json", with_stats=True) for tree_id, raw_tree in enumerate(dump): From c3f463a2cb4c2aab0b7b638938a0dea71052e5b6 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Fri, 29 Sep 2023 09:36:21 -0700 Subject: [PATCH 17/64] Update LightGBM Model Builder for TreeView --- src/gbt_convertors.pyx | 363 ++++++++++++++++++++--------------------- 1 file changed, 175 insertions(+), 188 deletions(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index 3588a74e66..e93ea73b43 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -37,19 +37,11 @@ class CatBoostNode: self.left = left -class LightGbmNode: - def __init__(self, tree: Dict[str, Any], parent_id: int, position: int) -> None: - self.tree = tree - self.parent_id = parent_id - self.position = position - - class Node: """Helper class holding Tree Node information""" def __init__( self, - node_id: int, cover: float, is_leaf: bool, default_left: bool, @@ -61,7 +53,6 @@ class Node: parent_id: Optional[int] = -1, position: Optional[int] = -1, ) -> None: - self.node_id = node_id self.cover = cover self.is_leaf = is_leaf self.default_left = default_left @@ -74,10 +65,10 @@ class Node: self.position = position @staticmethod - def from_dict(input_dict: Dict[str, Any]) -> "Node": + def from_xgb_dict(input_dict: Dict[str, Any]) -> "Node": if "children" in input_dict: - left_child = Node.from_dict(input_dict["children"][0]) - right_child = Node.from_dict(input_dict["children"][1]) + left_child = Node.from_xgb_dict(input_dict["children"][0]) + right_child = Node.from_xgb_dict(input_dict["children"][1]) n_children = 2 + left_child.n_children + right_child.n_children else: left_child = None @@ -86,7 +77,6 @@ class Node: is_leaf = "leaf" in input_dict default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] return Node( - node_id=input_dict["nodeid"], cover=input_dict["cover"], is_leaf=is_leaf, default_left=default_left, @@ -97,6 +87,42 @@ class Node: right_child=right_child, ) + @staticmethod + def from_lightgbm_dict(input_dict: Dict[str, Any]) -> "Node": + if "tree_structure" in input_dict: + tree = input_dict["tree_structure"] + else: + tree = input_dict + + n_children = 0 + if "left_child" in tree: + left_child = Node.from_lightgbm_dict(tree["left_child"]) + n_children += 1 + left_child.n_children + else: + left_child = None + if "right_child" in tree: + right_child = Node.from_lightgbm_dict(tree["right_child"]) + n_children += 1 + right_child.n_children + else: + right_child = None + + is_leaf = "leaf_value" in tree + empty_leaf = is_leaf and "leaf_count" not in tree + if is_leaf: + cover = tree["leaf_count"] + else: + cover = tree["internal_count"] + return Node( + cover=cover, + is_leaf=is_leaf, + default_left=is_leaf or tree["default_left"], + feature=tree.get("split_feature"), + value=tree["leaf_value"] if is_leaf else tree["threshold"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + def get_value_closest_float_downward(self) -> np.float64: """Get the closest exact fp value smaller than self.value""" return np.nextafter(np.single(self.value), np.single(-np.inf)) @@ -109,11 +135,13 @@ class Node: @property def feature(self) -> int: - if not (isinstance(self.__feature, str) and self.__feature.isnumeric()): - raise ValueError( - f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})" - ) - return int(self.__feature) + if isinstance(self.__feature, int): + return self.__feature + if 
isinstance(self.__feature, str) and self.__feature.isnumeric():
+            return int(self.__feature)
+        raise ValueError(
+            f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})"
+        )


 class TreeView:
@@ -151,7 +179,7 @@ class TreeList(list):
     model builders from an XGBoost.Booster object"""

     @staticmethod
-    def from_booster(booster) -> "TreeList":
+    def from_xgb_booster(booster) -> "TreeList":
         """
         Load a TreeList from an xgb.Booster object
         Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code,
@@ -161,193 +189,58 @@
         dump = booster.get_dump(dump_format="json", with_stats=True)
         for tree_id, raw_tree in enumerate(dump):
             raw_tree_parsed = json.loads(raw_tree)
-            root_node = Node.from_dict(raw_tree_parsed)
+            root_node = Node.from_xgb_dict(raw_tree_parsed)
             tl.append(TreeView(tree_id=tree_id, root_node=root_node))

         return tl

-    def __setitem__(self):
-        raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList")
-
-
-def get_lightgbm_params(booster):
-    return booster.dump_model()
-
-
-def get_xgboost_params(booster):
-    return json.loads(booster.save_config())
-
-
-def get_catboost_params(booster):
-    dump_filename = f"catboost_model_{getpid()}_{time()}"
-
-    # Dump model in file
-    booster.save_model(dump_filename, "json")
-
-    # Read json with model
-    with open(dump_filename) as file:
-        model_data = json.load(file)
-
-    # Delete dump file
-    remove(dump_filename)
-    return model_data
-
-
-def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any:
-    if lgb_model is None:
-        lgb_model = get_lightgbm_params(model)
-
-    n_features = lgb_model["max_feature_idx"] + 1
-    n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"]
-    n_classes = lgb_model["num_tree_per_iteration"]
-
-    is_regression = False
-    objective_fun = lgb_model["objective"]
-    if n_classes > 2:
-        if "multiclass" not in objective_fun:
-            raise TypeError(
-                "multiclass (softmax) objective is only supported for multiclass classification"
-            )
-    elif "binary" in objective_fun:  # nClasses == 1
-        n_classes = 2
-    else:
-        is_regression = True
-
-    if is_regression:
-        mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations)
-    else:
-        mb = gbt_clf_model_builder(
-            n_features=n_features, n_iterations=n_iterations, n_classes=n_classes
-        )
-
-    class_label = 0
-    iterations_counter = 0
-    for tree in lgb_model["tree_info"]:
-        if is_regression:
-            tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1)
-        else:
-            tree_id = mb.create_tree(
-                n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label
-            )
-
-        iterations_counter += 1
-        if iterations_counter == n_iterations:
-            iterations_counter = 0
-            class_label += 1
-        sub_tree = tree["tree_structure"]
+    @staticmethod
+    def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList":
+        """
+        Load a TreeList from a lgbm.Model object
+        Note: We cannot type-hint the Model without loading lightgbm as dependency in pyx code,
+        therefore no type hint is added.
+ """ + tl = TreeList() + for tree_id, tree_dict in enumerate(dump["tree_info"]): + root_node = Node.from_lightgbm_dict(tree_dict) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) - # root is leaf - if "leaf_value" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) - continue + return tl - # add root - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, + def __setitem__(self): + raise NotImplementedError( + "Use TreeList.from_*() methods to initialize a TreeList" ) - # create stack - node_stack: List[LightGbmNode] = [ - LightGbmNode(sub_tree["left_child"], parent_id, 0), - LightGbmNode(sub_tree["right_child"], parent_id, 1), - ] - - # dfs through it - while node_stack: - sub_tree = node_stack[-1].tree - parent_id = node_stack[-1].parent_id - position = node_stack[-1].position - node_stack.pop() - - # current node is leaf - if "leaf_index" in sub_tree: - mb.add_leaf( - tree_id=tree_id, - response=sub_tree["leaf_value"], - parent_id=parent_id, - position=position, - ) - continue - - # current node is split - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - parent_id=parent_id, - position=position, - ) - - # append children - node_stack.append(LightGbmNode(sub_tree["left_child"], parent_id, 0)) - node_stack.append(LightGbmNode(sub_tree["right_child"], parent_id, 1)) - - return mb.model() +def get_gbt_model_from_tree_list( + tree_list: TreeList, + n_iterations: int, + is_regression: bool, + n_features: int, + n_classes: int, + base_score: float, + add_base_score_as_tree: bool, +): + """Return a GBT Model from TreeList""" -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. 
- booster.feature_names = [str(i) for i in range(booster.num_features())] - - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification" + if is_regression: + if add_base_score_as_tree: + mb = gbt_reg_model_builder( + n_features=n_features, n_iterations=n_iterations + 1 ) - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 + tree_id = mb.create_tree(1) + mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification" - ) - else: - is_regression = True - - n_iterations = booster.best_iteration + 1 - - # Create - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - - # add base score as the first tree - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) else: mb = gbt_clf_model_builder( n_features=n_features, n_iterations=n_iterations, n_classes=n_classes ) class_label = 0 - node_list = TreeList.from_booster(booster) - for counter, tree in enumerate(node_list, start=1): + for counter, tree in enumerate(tree_list, start=1): # find out the number of nodes in the tree if is_regression: tree_id = mb.create_tree(tree.n_nodes) @@ -410,9 +303,103 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: child.position = position node_queue.append(child) + print("return model") return mb.model() +def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: + if booster is None: + booster = model.dump_model() + + n_features = booster["max_feature_idx"] + 1 + n_iterations = len(booster["tree_info"]) / booster["num_tree_per_iteration"] + n_classes = booster["num_tree_per_iteration"] + + is_regression = False + objective_fun = booster["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass classification" + ) + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + tree_list = TreeList.from_lightgbm_booster_dump(booster) + + return get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + base_score=0, + add_base_score_as_tree=False, + ) + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. 
+ booster.feature_names = [str(i) for i in range(booster.num_features())] + + if xgb_config is None: + xgb_config = json.loads(booster.save_config()) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) + elif objective_fun.find("binary:") == 0: + if objective_fun in ["binary:logistic", "binary:logitraw"]: + n_classes = 2 + else: + raise TypeError( + "binary:logistic and binary:logitraw are only supported for binary classification" + ) + else: + is_regression = True + + n_iterations = booster.best_iteration + 1 + + tree_list = TreeList.from_xgb_booster(booster) + + return get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + base_score=base_score, + add_base_score_as_tree=True, + ) + + +def get_catboost_params(booster): + dump_filename = f"catboost_model_{getpid()}_{time()}" + + # Dump model in file + booster.save_model(dump_filename, "json") + + # Read json with model + with open(dump_filename) as file: + model_data = json.load(file) + + # Delete dump file + remove(dump_filename) + return model_data + + def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: if not model.is_fitted(): raise RuntimeError("Model should be fitted before exporting to daal4py.") From b4206d960ab49a0cad9543aef879c7ff9ab502c8 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 2 Oct 2023 07:52:32 -0700 Subject: [PATCH 18/64] chore: rename model builders test file and remove ancient version check --- ...odel_builder.py => test_model_builders.py} | 38 ------------------- 1 file changed, 38 deletions(-) rename tests/{test_logistic_regression_model_builder.py => test_model_builders.py} (79%) diff --git a/tests/test_logistic_regression_model_builder.py b/tests/test_model_builders.py similarity index 79% rename from tests/test_logistic_regression_model_builder.py rename to tests/test_model_builders.py index 3a28677743..5a89e0893b 100644 --- a/tests/test_logistic_regression_model_builder.py +++ b/tests/test_model_builders.py @@ -28,20 +28,9 @@ # second is minor+patch - 0110, # third item is status - B daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) -reason = str(((2021, "P", 1))) + " not supported in this library version " -reason += str(daal_version) class LogRegModelBuilder(unittest.TestCase): - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) def test_iris_with_intercept(self): X, y = load_iris(return_X_y=True) n_classes = 3 @@ -59,15 +48,6 @@ def test_iris_with_intercept(self): pred_sklearn = clf.predict(X) self.assertTrue(np.allclose(pred_daal, pred_sklearn)) - @unittest.skipUnless( - all( - [ - hasattr(d4p, "logistic_regression_model_builder"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) def test_iris_without_intercept(self): X, y = load_iris(return_X_y=True) n_classes = 3 @@ -85,15 +65,6 @@ def test_iris_without_intercept(self): pred_sklearn = clf.predict(X) self.assertTrue(np.allclose(pred_daal, 
pred_sklearn))

-    @unittest.skipUnless(
-        all(
-            [
-                hasattr(d4p, "logistic_regression_model_builder"),
-                daal_check_version(((2021, "P", 1))),
-            ]
-        ),
-        reason,
-    )
     def test_breast_cancer_with_intercept(self):
         X, y = load_breast_cancer(return_X_y=True)
         n_classes = 2
@@ -111,15 +82,6 @@ def test_breast_cancer_with_intercept(self):
         pred_sklearn = clf.predict(X)
         self.assertTrue(np.allclose(pred_daal, pred_sklearn))

-    @unittest.skipUnless(
-        all(
-            [
-                hasattr(d4p, "logistic_regression_model_builder"),
-                daal_check_version(((2021, "P", 1))),
-            ]
-        ),
-        reason,
-    )
     def test_breast_cancer_without_intercept(self):
         X, y = load_breast_cancer(return_X_y=True)
         n_classes = 2

From f86f72dbef516ab3a13fe8433adcd2cba781e4b5 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Mon, 2 Oct 2023 09:37:26 -0700
Subject: [PATCH 19/64] Start cleaning up model builder tests, fix some failing tests

---
 requirements-test.txt                   |   3 +
 src/gbt_convertors-recursive-append.pyx | 150 ++++--------------------
 src/gbt_convertors.pyx                  |  70 ++++++-----
 tests/test_model_builders.py            | 148 +++++++++++++++++++++--
 4 files changed, 208 insertions(+), 163 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 10d61ade83..b9bd3d05c3 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,3 +6,6 @@ scikit-learn==1.2.2 ; python_version == '3.8'
 scikit-learn==1.3.1 ; python_version >= '3.9'
 pandas==2.0.1 ; python_version == '3.8'
 pandas==2.1.1 ; python_version >= '3.9'
+xgboost==1.7.6; python_version <= '3.9'
+xgboost==2.0.0; python_version >= '3.10'
+lightgbm==4.1.0

diff --git a/src/gbt_convertors-recursive-append.pyx b/src/gbt_convertors-recursive-append.pyx
index 95ea375348..6ffce3277d 100644
--- a/src/gbt_convertors-recursive-append.pyx
+++ b/src/gbt_convertors-recursive-append.pyx
@@ -155,7 +155,12 @@ class TreeList:
         self.tree_views: List[TreeView] = []

     @staticmethod
-    def from_booster(booster: xgb.Booster) -> "TreeList":
+    def from_booster(booster) -> "TreeList":
+        """
+        Load a TreeList from an xgb.Booster object
+        Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code,
+        therefore no type hint is added.
+ """ tl = TreeList() dump = booster.get_dump(dump_format="json", with_stats=True) for tree_id, raw_tree in enumerate(dump): @@ -220,92 +225,11 @@ def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: else: is_regression = True - if is_regression: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - class_label = 0 - iterations_counter = 0 - for tree in lgb_model["tree_info"]: - if is_regression: - tree_id = mb.create_tree(tree["num_leaves"] * 2 - 1) - else: - tree_id = mb.create_tree( - n_nodes=tree["num_leaves"] * 2 - 1, class_label=class_label - ) - - iterations_counter += 1 - if iterations_counter == n_iterations: - iterations_counter = 0 - class_label += 1 - sub_tree = tree["tree_structure"] - - # root is leaf - if "leaf_value" in sub_tree: - mb.add_leaf(tree_id=tree_id, response=sub_tree["leaf_value"]) - continue - - # add root - feat_val = sub_tree["threshold"] - if isinstance(feat_val, str): - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - default_left = int(sub_tree["default_left"]) - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=sub_tree["split_feature"], - feature_value=feat_val, - default_left=default_left, - ) - # create stack - node_stack: List[LightGbmNode] = [ - LightGbmNode(sub_tree["left_child"], parent_id, 0), - LightGbmNode(sub_tree["right_child"], parent_id, 1), - ] - - # dfs through it - while node_stack: - sub_tree = node_stack[-1].tree - parent_id = node_stack[-1].parent_id - position = node_stack[-1].position - node_stack.pop() - - # current node is leaf - if "leaf_index" in sub_tree: - mb.add_leaf( - tree_id=tree_id, - response=sub_tree["leaf_value"], - parent_id=parent_id, - position=position, - ) - continue + tree_list = ... 
-            # current node is split
-            feat_val = sub_tree["threshold"]
-            if isinstance(feat_val, str):
-                raise NotImplementedError(
-                    "Categorical features are not supported in daal4py Gradient Boosting Trees"
-                )
-            default_left = int(sub_tree["default_left"])
-            parent_id = mb.add_split(
-                tree_id=tree_id,
-                feature_index=sub_tree["split_feature"],
-                feature_value=feat_val,
-                default_left=default_left,
-                parent_id=parent_id,
-                position=position,
-            )
-
-            # append children
-            node_stack.append(LightGbmNode(sub_tree["left_child"], parent_id, 0))
-            node_stack.append(LightGbmNode(sub_tree["right_child"], parent_id, 1))
-
-    return mb.model()
+    return get_gbt_model_from_tree_list(tree_list, n_iterations=n_iterations, is_regression=is_regression, n_features=n_features, n_classes=n_classes, base_score=0, add_base_score_as_tree=False)


 def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
@@ -340,21 +264,28 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:

     n_iterations = booster.best_iteration + 1

-    # Create
-    if is_regression:
-        mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1)
+    tree_list = TreeList.from_booster(booster)
+
+    return get_gbt_model_from_tree_list(tree_list, n_iterations=n_iterations, is_regression=is_regression, n_features=n_features, n_classes=n_classes, base_score=base_score, add_base_score_as_tree=True)
+

-        # add base score as the first tree
-        tree_id = mb.create_tree(1)
-        mb.add_leaf(tree_id=tree_id, response=base_score, cover=1)
+def get_gbt_model_from_tree_list(tree_list: TreeList, n_iterations: int, is_regression: bool, n_features: int, n_classes: int, base_score: float, add_base_score_as_tree: bool):
+    """Return a GBT Model from TreeList"""
+
+    if is_regression:
+        if add_base_score_as_tree:
+            mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1)
+            tree_id = mb.create_tree(1)
+            mb.add_leaf(tree_id=tree_id, response=base_score, cover=1)
+        else:
+            mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations)
     else:
         mb = gbt_clf_model_builder(
             n_features=n_features, n_iterations=n_iterations, n_classes=n_classes
         )

     class_label = 0
-    node_list = TreeList.from_booster(booster)
-    for counter, tree in enumerate(node_list, start=1):
+    for counter, tree in enumerate(tree_list, start=1):
         # find out the number of nodes in the tree
         if is_regression:
             tree_id = mb.create_tree(tree.n_nodes)
@@ -368,41 +299,6 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any:
             mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover)
             continue

-        # def append(node: Node):
-        #     if node.is_leaf:
-        #         assert node.parent_id != -1, "node.parent_id must not be -1"
-        #         assert node.position != -1, "node.position must not be -1"
-
-        #         mb.add_leaf(
-        #             tree_id=tree_id,
-        #             response=node.value,
-        #             cover=node.cover,
-        #             parent_id=node.parent_id,
-        #             position=node.position,
-        #         )
-
-        #     else:
-        #         assert node.left_child, "Split node must have left child"
-        #         assert node.right_child, "Split node must have right child"
-
-        #         parent_id = mb.add_split(
-        #             tree_id=tree_id,
-        #             feature_index=node.feature,
-        #             feature_value=node.get_value_closest_float_downward(),
-        #             cover=node.cover,
-        #             default_left=node.default_left,
-        #         )
-
-        #         node.left_child.parent_id = parent_id
-        #         node.left_child.position = 0
-        #         append(node.left_child)
-
-        #         node.right_child.parent_id = parent_id
-        #         node.right_child.position = 1
-        #         append(node.right_child)
-
-        # append(tree.root_node)
-
         root_node = tree.root_node
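         # Register the root split first; descendants are then attached through
         # their parent_id/position pair (cf. the breadth-first node_queue loop
         # in gbt_convertors.pyx above), so every leaf lands in the right slot.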
parent_id = mb.add_split( tree_id=tree_id, diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index e93ea73b43..b7484c7ad8 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -15,10 +15,10 @@ # =============================================================================== import json +import logging from collections import deque -from os import getpid, remove -from time import time -from typing import Any, Deque, Dict, Generator, List, Optional, Tuple +from tempfile import NamedTemporaryFile +from typing import Any, Deque, Dict, List, Optional, Tuple import numpy as np @@ -30,11 +30,13 @@ class CatBoostNode: value: Optional[List[float]] = None, right: Optional[int] = None, left: Optional[float] = None, + cover: Optional[float] = None, ) -> None: self.split = split self.value = value self.right = right self.left = left + self.cover = cover class Node: @@ -107,13 +109,8 @@ class Node: right_child = None is_leaf = "leaf_value" in tree - empty_leaf = is_leaf and "leaf_count" not in tree - if is_leaf: - cover = tree["leaf_count"] - else: - cover = tree["internal_count"] return Node( - cover=cover, + cover=tree["leaf_count"] if is_leaf else tree["internal_count"], is_leaf=is_leaf, default_left=is_leaf or tree["default_left"], feature=tree.get("split_feature"), @@ -159,7 +156,7 @@ class TreeView: def value(self) -> float: if not self.is_leaf: raise ValueError("Tree is not a leaf-only tree") - if not self.root_node.value: + if self.root_node.value is None: raise ValueError("Tree is leaf-only but leaf node has no value") return self.root_node.value @@ -214,6 +211,22 @@ class TreeList(list): ) +def get_lightgbm_params(booster): + return booster.dump_model() + + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + + +def get_catboost_params(booster): + with NamedTemporaryFile() as fp: + booster.save_model(fp.name, "json") + fp.seek(0) + model_data = json.load(fp) + return model_data + + def get_gbt_model_from_tree_list( tree_list: TreeList, n_iterations: int, @@ -303,7 +316,6 @@ def get_gbt_model_from_tree_list( child.position = position node_queue.append(child) - print("return model") return mb.model() @@ -347,7 +359,7 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: booster.feature_names = [str(i) for i in range(booster.num_features())] if xgb_config is None: - xgb_config = json.loads(booster.save_config()) + xgb_config = get_xgboost_params(booster) n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) @@ -385,21 +397,6 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: ) -def get_catboost_params(booster): - dump_filename = f"catboost_model_{getpid()}_{time()}" - - # Dump model in file - booster.save_model(dump_filename, "json") - - # Read json with model - with open(dump_filename) as file: - model_data = json.load(file) - - # Delete dump file - remove(dump_filename) - return model_data - - def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: if not model.is_fitted(): raise RuntimeError("Model should be fitted before exporting to daal4py.") @@ -566,6 +563,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: feature_index=cur_level_split["feature_index"], feature_value=cur_level_split["value"], default_left=default_left, + cover=0.0, ) prev_level_nodes = [root_id] @@ -583,6 +581,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: 
feature_index=cur_level_split["feature_index"], feature_value=cur_level_split["value"], default_left=default_left, + cover=0.0, ) cur_right_node = mb.add_split( tree_id=cur_tree_id, @@ -591,6 +590,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: feature_index=cur_level_split["feature_index"], feature_value=cur_level_split["value"], default_left=default_left, + cover=0.0, ) cur_level_nodes.append(cur_left_node) cur_level_nodes.append(cur_right_node) @@ -606,6 +606,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + bias, parent_id=prev_level_nodes[last_level_node_num], position=0, + cover=0.0, ) mb.add_leaf( tree_id=cur_tree_id, @@ -614,6 +615,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + bias, parent_id=prev_level_nodes[last_level_node_num], position=1, + cover=0.0, ) else: for last_level_node_num in range(len(prev_level_nodes)): @@ -628,12 +630,14 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: response=cur_tree_leaf_val[left_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=0, + cover=0.0, ) mb.add_leaf( tree_id=cur_tree_id, response=cur_tree_leaf_val[right_index] * scale + bias, parent_id=prev_level_nodes[last_level_node_num], position=1, + cover=0.0, ) else: for class_label in range(n_tree_each_iter): @@ -648,6 +652,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: feature_index=root_node.split["feature_index"], feature_value=root_node.split["value"], default_left=default_left, + cover=0.0, ) nodes_queue = [(root_node, root_id)] while nodes_queue: @@ -662,6 +667,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: feature_index=left_node.split["feature_index"], feature_value=left_node.split["value"], default_left=default_left, + cover=0.0, ) nodes_queue.append((left_node, left_node_id)) else: @@ -670,6 +676,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: response=left_node.value[class_label], parent_id=cur_node_id, position=0, + cover=0.0, ) right_node = cur_node.right # Check if node is a leaf @@ -681,6 +688,7 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: feature_index=right_node.split["feature_index"], feature_value=right_node.split["value"], default_left=default_left, + cover=0.0, ) nodes_queue.append((right_node, right_node_id)) else: @@ -689,12 +697,18 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: response=cur_node.right.value[class_label], parent_id=cur_node_id, position=1, + cover=0.0, ) else: # Tree has only one node mb.add_leaf( - tree_id=cur_tree_id, response=root_node.value[class_label] + tree_id=cur_tree_id, + response=root_node.value[class_label], + cover=0.0, ) + logging.warning( + "Models converted from CatBoost cannot be used for SHAP value calculation" + ) return mb.model() diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 5a89e0893b..d90de55f1c 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -16,18 +16,19 @@ import unittest +import catboost as cb +import lightgbm as lgbm import numpy as np -from sklearn.datasets import load_breast_cancer, load_iris +import xgboost as xgb +from sklearn.datasets import ( + load_breast_cancer, + load_iris, + make_classification, + make_regression, +) from sklearn.linear_model import LogisticRegression import daal4py as d4p -from daal4py import _get__daal_link_version__ as dv -from daal4py.sklearn._utils import 
daal_check_version - -# First item is major version - 2021, -# second is minor+patch - 0110, -# third item is status - B -daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) class LogRegModelBuilder(unittest.TestCase): @@ -100,5 +101,136 @@ def test_breast_cancer_without_intercept(self): self.assertTrue(np.allclose(pred_daal, pred_sklearn)) +class XGBoostRegressionModelBuilder(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.X, cls.y = make_regression(n_samples=2, n_features=10, random_state=42) + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + cls.xgb_model = xgb.XGBRegressor(max_depth=5, n_estimators=50, random_state=42) + cls.xgb_model.fit(cls.X, cls.y) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + self.assertEqual(m.n_classes_, 0) + self.assertEqual(m.n_features_in_, 10) + self.assertTrue(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X) + xgboost_pred = self.xgb_model.predict(self.X) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_nan) + xgboost_pred = self.xgb_model.predict(self.X_nan) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference missing value prediction different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_model_predict_shap_contribs(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X, pred_contribs=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X), + pred_contribs=True, + approx_contribs=False, + validate_features=False, + ) + self.assertTrue( + d4p_pred.shape == xgboost_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {xgboost_pred.shape}", + ) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference SHAP contribution prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_model_predict_shap_interactions(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X, pred_interactions=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X), + pred_interactions=True, + approx_contribs=False, + validate_features=False, + ) + self.assertTrue( + d4p_pred.shape == xgboost_pred.shape, + f"d4p and reference SHAP interaction shape is different {d4p_pred.shape} != {xgboost_pred.shape}", + ) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference SHAP interaction prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_model_predict_shap_contribs_missing_values(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + d4p_pred = m.predict(self.X_nan, pred_contribs=True) + xgboost_pred = booster.predict( + xgb.DMatrix(self.X_nan), + pred_contribs=True, + approx_contribs=False, + validate_features=False, + ) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference SHAP contribution missing value prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + +class XGBoostClassificationModelBuilder(unittest.TestCase): + @classmethod + def 
setUpClass(cls): + cls.X, cls.y = make_classification(n_samples=500, n_features=10, random_state=42) + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + cls.xgb_model = xgb.XGBClassifier(max_depth=5, n_estimators=50, random_state=42) + cls.xgb_model.fit(cls.X, cls.y) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + self.assertEqual(m.n_classes_, 2) + self.assertEqual(m.n_features_in_, 10) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X) + xgboost_pred = self.xgb_model.predict(self.X) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict(self.X_nan) + xgboost_pred = self.xgb_model.predict(self.X_nan) + self.assertTrue( + np.allclose(d4p_pred, xgboost_pred, atol=1e-7), + f"d4p and reference missing value prediction different (d4p - ref) = {d4p_pred - xgboost_pred}", + ) + + def test_model_predict_shap_contribs(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + with self.assertRaises(NotImplementedError): + m.predict(self.X, pred_contribs=True) + + def test_model_predict_shap_interactions(self): + booster = self.xgb_model.get_booster() + m = d4p.mb.convert_model(booster) + with self.assertRaises(NotImplementedError): + m.predict(self.X, pred_contribs=True) + + if __name__ == "__main__": unittest.main() From 4b4620f8a1f6c36e2b410b6c20d5fc72ef528168 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 4 Oct 2023 03:27:40 -0700 Subject: [PATCH 20/64] Add exhaustive model builder testing --- tests/test_model_builders.py | 261 +++++++++++++++++++++++++++++++++-- 1 file changed, 247 insertions(+), 14 deletions(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index d90de55f1c..4c4050bc78 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -19,6 +19,7 @@ import catboost as cb import lightgbm as lgbm import numpy as np +import shap import xgboost as xgb from sklearn.datasets import ( load_breast_cancer, @@ -104,21 +105,24 @@ def test_breast_cancer_without_intercept(self): class XGBoostRegressionModelBuilder(unittest.TestCase): @classmethod def setUpClass(cls): - cls.X, cls.y = make_regression(n_samples=2, n_features=10, random_state=42) + X, y = make_regression(n_samples=2, n_features=10, random_state=42) + cls.X_test = X[:2, :] cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) cls.xgb_model = xgb.XGBRegressor(max_depth=5, n_estimators=50, random_state=42) - cls.xgb_model.fit(cls.X, cls.y) + cls.xgb_model.fit(X, y) def test_model_conversion(self): m = d4p.mb.convert_model(self.xgb_model.get_booster()) + # XGBoost treats regression as 0 classes, LightGBM 1 class + # For us, it does not make a difference and both are acceptable self.assertEqual(m.n_classes_, 0) self.assertEqual(m.n_features_in_, 10) self.assertTrue(m._is_regression) def test_model_predict(self): m = d4p.mb.convert_model(self.xgb_model.get_booster()) - d4p_pred = m.predict(self.X) - xgboost_pred = self.xgb_model.predict(self.X) + d4p_pred = m.predict(self.X_test) + xgboost_pred = self.xgb_model.predict(self.X_test) self.assertTrue( np.allclose(d4p_pred, xgboost_pred, atol=1e-7), f"d4p and reference prediction are different (d4p 
- ref) = {d4p_pred - xgboost_pred}",
@@ -136,9 +140,9 @@ def test_missing_value_support(self):
     def test_model_predict_shap_contribs(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
-        d4p_pred = m.predict(self.X, pred_contribs=True)
+        d4p_pred = m.predict(self.X_test, pred_contribs=True)
         xgboost_pred = booster.predict(
-            xgb.DMatrix(self.X),
+            xgb.DMatrix(self.X_test),
             pred_contribs=True,
             approx_contribs=False,
             validate_features=False,
         )
@@ -155,9 +159,9 @@ def test_model_predict_shap_contribs(self):
     def test_model_predict_shap_interactions(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
-        d4p_pred = m.predict(self.X, pred_interactions=True)
+        d4p_pred = m.predict(self.X_test, pred_interactions=True)
         xgboost_pred = booster.predict(
-            xgb.DMatrix(self.X),
+            xgb.DMatrix(self.X_test),
             pred_interactions=True,
             approx_contribs=False,
             validate_features=False,
         )
@@ -190,10 +194,11 @@ def test_model_predict_shap_contribs_missing_values(self):
 class XGBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.X, cls.y = make_classification(n_samples=500, n_features=10, random_state=42)
+        X, y = make_classification(n_samples=500, n_features=10, random_state=42)
+        cls.X_test = X[:2, :]
         cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
         cls.xgb_model = xgb.XGBClassifier(max_depth=5, n_estimators=50, random_state=42)
-        cls.xgb_model.fit(cls.X, cls.y)
+        cls.xgb_model.fit(X, y)

     def test_model_conversion(self):
         m = d4p.mb.convert_model(self.xgb_model.get_booster())
@@ -203,8 +208,8 @@ def test_model_conversion(self):
     def test_model_predict(self):
         m = d4p.mb.convert_model(self.xgb_model.get_booster())
-        d4p_pred = m.predict(self.X)
-        xgboost_pred = self.xgb_model.predict(self.X)
+        d4p_pred = m.predict(self.X_test)
+        xgboost_pred = self.xgb_model.predict(self.X_test)
         self.assertTrue(
             np.allclose(d4p_pred, xgboost_pred, atol=1e-7),
             f"d4p and reference prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}",
@@ -223,13 +228,241 @@ def test_model_predict_shap_contribs(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
         with self.assertRaises(NotImplementedError):
-            m.predict(self.X, pred_contribs=True)
+            m.predict(self.X_test, pred_contribs=True)

     def test_model_predict_shap_interactions(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
         with self.assertRaises(NotImplementedError):
-            m.predict(self.X, pred_contribs=True)
+            m.predict(self.X_test, pred_interactions=True)
+
+
+class LightGBMRegressionModelBuilder(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        X, y = make_regression(n_samples=100, n_features=10, random_state=42)
+        cls.X_test = X[:2, :]
+        cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+        params = {
+            "task": "train",
+            "boosting": "gbdt",
+            "objective": "regression",
+            "num_leaves": 10,
+            "learning_rate": 0.05,
+            "metric": {"l2", "l1"},
+            "verbose": -1,
+        }
+        cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X, y))
+
+    def test_model_conversion(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        # XGBoost treats regression as 0 classes, LightGBM 1 class
+        # For us, it does not make a difference and both are acceptable
+        self.assertEqual(m.n_classes_, 1)
+        self.assertEqual(m.n_features_in_, 10)
+        self.assertTrue(m._is_regression)
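As these suites illustrate, d4p.mb.convert_model accepts each framework's native object directly; a hedged summary of the calling conventions exercised by the tests (return behaviour assumed to match the assertions around them):

    daal_model = d4p.mb.convert_model(xgb_model.get_booster())  # xgboost: pass the Booster
    daal_model = d4p.mb.convert_model(lgbm_model)               # lightgbm: Booster from lgbm.train
    daal_model = d4p.mb.convert_model(cb_model)                 # catboost: fitted cb.CatBoost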
+
+    def test_model_predict(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_test)
+        lgbm_pred = self.lgbm_model.predict(self.X_test)
+        max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_missing_value_support(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_nan)
+        lgbm_pred = self.lgbm_model.predict(self.X_nan)
+        max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_model_predict_shap_contribs(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_test, pred_contribs=True)
+        lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True)
+        self.assertTrue(
+            d4p_pred.shape == lgbm_pred.shape,
+            f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}",
+        )
+        max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_model_predict_shap_interactions(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column
+        d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1]
+        explainer = shap.TreeExplainer(self.lgbm_model)
+        shap_pred = explainer.shap_interaction_values(self.X_test)
+        self.assertTrue(
+            d4p_pred.shape == shap_pred.shape,
+            f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}",
+        )
+        max_diff = np.absolute(d4p_pred - shap_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_model_predict_shap_contribs_missing_values(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_nan, pred_contribs=True)
+        lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True)
+        self.assertTrue(
+            d4p_pred.shape == lgbm_pred.shape,
+            f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}",
+        )
+        max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+
+class LightGBMClassificationModelBuilder(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        X, y = make_classification(
+            random_state=3, n_classes=3, n_informative=3, n_features=10
+        )
+        cls.X_test = X[:2, :]
+        cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+        params = {
+            "n_estimators": 10,
+            "task": "train",
+            "boosting": "gbdt",
+            "objective": "multiclass",
+            "num_leaves": 4,
+            "num_class": 3,
+            "verbose": -1,
+        }
+        cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X, y))
+
+    def test_model_conversion(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        self.assertEqual(m.n_classes_, 3)
+        self.assertEqual(m.n_features_in_, 10)
+        self.assertFalse(m._is_regression)
+
+    def test_model_predict(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_test)
+        lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1)
+        self.assertTrue((d4p_pred == lgbm_pred).all())
+
+    def test_missing_value_support(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        d4p_pred = m.predict(self.X_nan)
+        lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1)
+        self.assertTrue((d4p_pred == lgbm_pred).all())
+
+    def test_model_predict_shap_contribs(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        with self.assertRaises(NotImplementedError):
+            m.predict(self.X_test, pred_contribs=True)
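A note on the slicing in test_model_predict_shap_interactions above: daal4py keeps the bias term in its interaction matrix while the shap package drops it. A hedged shape sketch for the fixtures used here (2 samples, 10 features):

    d4p_interactions = m.predict(X_test, pred_interactions=True)  # shape (2, 11, 11), bias included
    trimmed = d4p_interactions[:, :-1, :-1]                       # shape (2, 10, 10)
    # trimmed is comparable with shap.TreeExplainer(booster).shap_interaction_values(X_test)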
+
+    def test_model_predict_shap_interactions(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        with self.assertRaises(NotImplementedError):
+            m.predict(self.X_test, pred_interactions=True)
+
+    def test_model_predict_shap_contribs_missing_values(self):
+        m = d4p.mb.convert_model(self.lgbm_model)
+        with self.assertRaises(NotImplementedError):
+            m.predict(self.X_nan, pred_contribs=True)
+
+
+class CatBoostRegressionModelBuilder(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        X, y = make_regression(n_samples=100, n_features=10, random_state=42)
+        cls.X_test = X[:2, :]
+        cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+        params = {
+            "reg_lambda": 1,
+            "max_depth": 3,
+            "num_leaves": 2**3,
+            "verbose": 0,
+            "objective": "RMSE",
+            "learning_rate": 0.3,
+            "n_estimators": 25,
+        }
+        cls.cb_model = cb.CatBoost(params)
+        cls.cb_model.fit(X, y, verbose=0)
+
+    def test_model_conversion(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        self.assertTrue(hasattr(m, "daal_model_"))
+        self.assertIsInstance(m.daal_model_, d4p._daal4py.gbt_regression_model)
+        self.assertEqual(m.daal_model_.NumberOfFeatures, 10)
+        self.assertEqual(m.daal_model_.NumberOfTrees, 25)
+        self.assertEqual(m.n_features_in_, 10)
+        self.assertTrue(m._is_regression)
+
+    def test_model_predict(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        d4p_pred = m.predict(self.X_test)
+        cb_pred = self.cb_model.predict(self.X_test)
+        max_diff = np.absolute(d4p_pred - cb_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_missing_value_support(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        d4p_pred = m.predict(self.X_nan)
+        cb_pred = self.cb_model.predict(self.X_nan)
+        max_diff = np.absolute(d4p_pred - cb_pred).reshape(1, -1).max()
+        self.assertLess(max_diff, 1e-7)
+
+    def test_model_predict_shap_contribs(self):
+        # SHAP value support from CatBoost models is to be added
+        with self.assertWarnsRegex(
+            Warning,
+            "Models converted from CatBoost cannot be used for SHAP value calculation",
+        ):
+            d4p.mb.convert_model(self.cb_model)
+
+
+class CatBoostClassificationModelBuilder(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        X, y = make_classification(
+            n_classes=3, n_features=10, n_informative=3, random_state=42
+        )
+        cls.X_test = X[:2, :]
+        cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
+        params = {
+            "reg_lambda": 1,
+            "max_depth": 3,
+            "num_leaves": 2**3,
+            "verbose": 0,
+            "objective": "MultiClass",
+            "learning_rate": 0.3,
+            "n_estimators": 25,
+        }
+        cls.cb_model = cb.CatBoost(params)
+        cls.cb_model.fit(X, y, verbose=0)
+
+    def test_model_conversion(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        self.assertTrue(hasattr(m, "daal_model_"))
+        self.assertIsInstance(m.daal_model_, d4p._daal4py.gbt_classification_model)
+        self.assertEqual(m.daal_model_.NumberOfFeatures, 10)
+        self.assertEqual(m.daal_model_.NumberOfTrees, 3 * 25)
+        self.assertEqual(m.n_features_in_, 10)
+        self.assertFalse(m._is_regression)
+
+    def test_model_predict(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        d4p_pred = m.predict(self.X_test)
+        cb_pred = self.cb_model.predict(self.X_test, prediction_type="Class").T[0]
+        self.assertTrue((d4p_pred == cb_pred).all())
+
+    def test_missing_value_support(self):
+        m = d4p.mb.convert_model(self.cb_model)
+        d4p_pred = m.predict(self.X_nan)
+        cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0]
+        self.assertTrue((d4p_pred == cb_pred).all())
+
+    def test_model_predict_shap_contribs(self):
+        # SHAP value support from CatBoost models is to be added
+        with self.assertWarnsRegex(
+            Warning,
calculation", + ): + d4p.mb.convert_model(self.cb_model) if __name__ == "__main__": From d5e0c0c86e698f8f5a339b60aa31cdd6ea4ab036 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 4 Oct 2023 03:36:39 -0700 Subject: [PATCH 21/64] chore: merge test_xgboost_mb.py and test_model_builders.py --- tests/test_model_builders.py | 118 +++++++++++++++++++++++ tests/test_xgboost_mb.py | 180 ----------------------------------- 2 files changed, 118 insertions(+), 180 deletions(-) delete mode 100644 tests/test_xgboost_mb.py diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 4c4050bc78..c84dc28a73 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -28,6 +28,7 @@ make_regression, ) from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split import daal4py as d4p @@ -465,5 +466,122 @@ def test_model_predict_shap_contribs(self): d4p.mb.convert_model(self.cb_model) +class XGBoostEarlyStopping(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + num_classes = 3 + X, y = make_classification( + n_samples=1500, + n_features=10, + n_informative=3, + n_classes=num_classes, + random_state=42, + ) + X_train, cls.X_test, y_train, cls.y_test = train_test_split( + X, y, test_size=0.5, random_state=42 + ) + + # training parameters setting + params = { + "n_estimators": 100, + "max_bin": 256, + "scale_pos_weight": 2, + "lambda_l2": 1, + "alpha": 0.9, + "max_depth": 8, + "num_leaves": 2**8, + "verbosity": 0, + "objective": "multi:softproba", + "learning_rate": 0.3, + "num_class": num_classes, + "early_stopping_rounds": 5, + } + + cls.xgb_clf = xgb.XGBClassifier(**params) + cls.xgb_clf.fit(X_train, y_train, eval_set=[(cls.X_test, cls.y_test)]) + + def test_early_stopping(self): + xgb_prediction = self.xgb_clf.predict(self.X_test) + xgb_proba = self.xgb_clf.predict_proba(self.X_test) + xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(self.y_test)) + + booster = self.xgb_clf.get_booster() + daal_model = d4p.mb.convert_model(booster) + daal_prediction = daal_model.predict(self.X_test) + daal_proba = daal_model.predict_proba(self.X_test) + daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(self.y_test)) + + self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) + max_diff = np.absolute(xgb_proba - daal_proba).reshape(1, -1).max() + self.assertLess(max_diff, 1e-7) + + +class ModelBuilderTreeView(unittest.TestCase): + def test_model_from_booster(self): + class MockBooster: + def get_dump(self, *_, **kwargs): + # raw dump of 2 trees with a max depth of 1 + return [ + ' { "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, "yes": 1, "no": 2, "missing": 1 , "gain": 3, "cover": 4, "children": [\n { "nodeid": 1, "leaf": 5 , "cover": 6 }, \n { "nodeid": 2, "leaf": 7 , "cover":8 }\n ]}', + ' { "nodeid": 0, "leaf": 0.2 , "cover": 42 }', + ] + + mock = MockBooster() + result = d4p.TreeList.from_booster(mock) + self.assertEqual(len(result), 2) + + tree0 = result[0] + self.assertIsInstance(tree0, d4p.TreeView) + self.assertFalse(tree0.is_leaf) + with self.assertRaises(ValueError): + tree0.cover + with self.assertRaises(ValueError): + tree0.value + + self.assertIsInstance(tree0.root_node, d4p.Node) + + self.assertEqual(tree0.root_node.node_id, 0) + self.assertEqual(tree0.root_node.left_child.node_id, 1) + self.assertEqual(tree0.root_node.right_child.node_id, 2) + + self.assertEqual(tree0.root_node.cover, 4) + self.assertEqual(tree0.root_node.left_child.cover, 6) + 
self.assertEqual(tree0.root_node.right_child.cover, 8) + + self.assertFalse(tree0.root_node.is_leaf) + self.assertTrue(tree0.root_node.left_child.is_leaf) + self.assertTrue(tree0.root_node.right_child.is_leaf) + + self.assertTrue(tree0.root_node.default_left) + self.assertFalse(tree0.root_node.left_child.default_left) + self.assertFalse(tree0.root_node.right_child.default_left) + + self.assertEqual(tree0.root_node.feature, 1) + with self.assertRaises(ValueError): + tree0.root_node.left_child.feature + with self.assertRaises(ValueError): + tree0.root_node.right_child.feature + + self.assertEqual(tree0.root_node.value, 2) + self.assertEqual(tree0.root_node.left_child.value, 5) + self.assertEqual(tree0.root_node.right_child.value, 7) + + self.assertEqual(tree0.root_node.n_children, 2) + self.assertEqual(tree0.root_node.left_child.n_children, 0) + self.assertEqual(tree0.root_node.right_child.n_children, 0) + + self.assertIsNone(tree0.root_node.left_child.left_child) + self.assertIsNone(tree0.root_node.left_child.right_child) + self.assertIsNone(tree0.root_node.right_child.left_child) + self.assertIsNone(tree0.root_node.right_child.right_child) + + tree1 = result[1] + self.assertIsInstance(tree1, d4p.TreeView) + self.assertTrue(tree1.is_leaf) + self.assertEqual(tree1.n_nodes, 1) + self.assertEqual(tree1.cover, 42) + self.assertEqual(tree1.value, 0.2) + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_xgboost_mb.py b/tests/test_xgboost_mb.py deleted file mode 100644 index 9d1adf5ef0..0000000000 --- a/tests/test_xgboost_mb.py +++ /dev/null @@ -1,180 +0,0 @@ -# ============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
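
For reference, a standalone sketch of the dump format exercised by the
MockBooster test above (illustrative only, standard library only): it decodes
the same raw JSON layout that xgb.Booster.get_dump(dump_format="json",
with_stats=True) emits, and checks the rule that Node.from_xgb_dict uses to
derive default_left:

    import json

    raw = (
        '{ "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, '
        '"yes": 1, "no": 2, "missing": 1, "gain": 3, "cover": 4, "children": ['
        '{ "nodeid": 1, "leaf": 5, "cover": 6 }, '
        '{ "nodeid": 2, "leaf": 7, "cover": 8 } ]}'
    )
    root = json.loads(raw)
    assert "leaf" not in root              # split node: value sits in "split_condition"
    assert root["yes"] == root["missing"]  # missing values default to the left branch
    assert [c["leaf"] for c in root["children"]] == [5, 7]
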
-# ============================================================================== - -import importlib.util -import unittest - -import numpy as np -from sklearn.datasets import make_classification -from sklearn.model_selection import train_test_split - -import daal4py as d4p -from daal4py import _get__daal_link_version__ as dv -from daal4py.sklearn._utils import daal_check_version - -# First item is major version - 2021, -# second is minor+patch - 0110, -# third item is status - B -daal_version = (int(dv()[0:4]), dv()[10:11], int(dv()[4:8])) -reason = str(((2021, "P", 1))) + " not supported in this library version " -reason += str(daal_version) - - -class XgboostModelBuilder(unittest.TestCase): - @unittest.skipUnless( - all( - [ - hasattr(d4p, "get_gbt_model_from_xgboost"), - hasattr(d4p, "gbt_classification_prediction"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - @unittest.skipUnless( - importlib.util.find_spec("xgboost") is not None, - "xgboost library is not installed", - ) - def test_earlystop(self): - import xgboost as xgb - - num_classes = 3 - X, y = make_classification( - n_samples=1000, - n_features=10, - n_informative=3, - n_classes=num_classes, - random_state=42, - ) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.3, random_state=42 - ) - - # training parameters setting - params = { - "n_estimators": 100, - "max_bin": 256, - "scale_pos_weight": 2, - "lambda_l2": 1, - "alpha": 0.9, - "max_depth": 8, - "num_leaves": 2**8, - "verbosity": 0, - "objective": "multi:softproba", - "learning_rate": 0.3, - "num_class": num_classes, - "early_stopping_rounds": 5, - } - - xgb_clf = xgb.XGBClassifier(**params) - xgb_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) - booster = xgb_clf.get_booster() - - xgb_prediction = xgb_clf.predict(X_test) - xgb_proba = xgb_clf.predict_proba(X_test) - xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(y_test)) - - daal_model = d4p.mb.convert_model(booster) - - daal_prediction = daal_model.predict(X_test) - daal_proba = daal_model.predict_proba(X_test) - daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(y_test)) - - self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) - self.assertTrue(np.allclose(xgb_proba, daal_proba)) - - @unittest.skipUnless( - all( - [ - hasattr(d4p, "get_gbt_model_from_xgboost"), - hasattr(d4p, "gbt_classification_prediction"), - daal_check_version(((2021, "P", 1))), - ] - ), - reason, - ) - @unittest.skipUnless( - importlib.util.find_spec("xgboost") is not None, - "xgboost library is not installed", - ) - def test_model_from_booster(self): - class MockBooster: - def get_dump(self, *_, **kwargs): - # raw dump of 2 trees with a max depth of 1 - return [ - ' { "nodeid": 0, "depth": 0, "split": "1", "split_condition": 2, "yes": 1, "no": 2, "missing": 1 , "gain": 3, "cover": 4, "children": [\n { "nodeid": 1, "leaf": 5 , "cover": 6 }, \n { "nodeid": 2, "leaf": 7 , "cover":8 }\n ]}', - ' { "nodeid": 0, "leaf": 0.2 , "cover": 42 }', - ] - - mock = MockBooster() - result = d4p.TreeList.from_booster(mock) - self.assertEqual(len(result), 2) - - tree0 = result[0] - self.assertIsInstance(tree0, d4p.TreeView) - self.assertFalse(tree0.is_leaf) - with self.assertRaises(ValueError): - tree0.cover - with self.assertRaises(ValueError): - tree0.value - - self.assertIsInstance(tree0.root_node, d4p.Node) - - self.assertEqual(tree0.root_node.node_id, 0) - self.assertEqual(tree0.root_node.left_child.node_id, 1) - 
self.assertEqual(tree0.root_node.right_child.node_id, 2) - - self.assertEqual(tree0.root_node.cover, 4) - self.assertEqual(tree0.root_node.left_child.cover, 6) - self.assertEqual(tree0.root_node.right_child.cover, 8) - - self.assertFalse(tree0.root_node.is_leaf) - self.assertTrue(tree0.root_node.left_child.is_leaf) - self.assertTrue(tree0.root_node.right_child.is_leaf) - - self.assertTrue(tree0.root_node.default_left) - self.assertFalse(tree0.root_node.left_child.default_left) - self.assertFalse(tree0.root_node.right_child.default_left) - - self.assertEqual(tree0.root_node.feature, 1) - with self.assertRaises(ValueError): - tree0.root_node.left_child.feature - with self.assertRaises(ValueError): - tree0.root_node.right_child.feature - - self.assertEqual(tree0.root_node.value, 2) - self.assertEqual(tree0.root_node.left_child.value, 5) - self.assertEqual(tree0.root_node.right_child.value, 7) - - self.assertEqual(tree0.root_node.n_children, 2) - self.assertEqual(tree0.root_node.left_child.n_children, 0) - self.assertEqual(tree0.root_node.right_child.n_children, 0) - - self.assertIsNone(tree0.root_node.left_child.left_child) - self.assertIsNone(tree0.root_node.left_child.right_child) - self.assertIsNone(tree0.root_node.right_child.left_child) - self.assertIsNone(tree0.root_node.right_child.right_child) - - tree1 = result[1] - self.assertIsInstance(tree1, d4p.TreeView) - self.assertTrue(tree1.is_leaf) - self.assertEqual(tree1.n_nodes, 1) - self.assertEqual(tree1.cover, 42) - self.assertEqual(tree1.value, 0.2) - - -if __name__ == "__main__": - unittest.main() From bdde09630f64d6292cad8ab75e74dee6393d6023 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 4 Oct 2023 08:14:37 -0700 Subject: [PATCH 22/64] fix: support XGBoost models trained with early stopping --- src/gbt_convertors.pyx | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index b7484c7ad8..f0c5edf2e7 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -15,10 +15,10 @@ # =============================================================================== import json -import logging from collections import deque from tempfile import NamedTemporaryFile from typing import Any, Deque, Dict, List, Optional, Tuple +from warnings import warn import numpy as np @@ -173,10 +173,10 @@ class TreeView: class TreeList(list): """Helper class that is able to extract all information required by the - model builders from an XGBoost.Booster object""" + model builders from various objects""" @staticmethod - def from_xgb_booster(booster) -> "TreeList": + def from_xgb_booster(booster, max_trees: int) -> "TreeList": """ Load a TreeList from an xgb.Booster object Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code, @@ -185,6 +185,8 @@ class TreeList(list): tl = TreeList() dump = booster.get_dump(dump_format="json", with_stats=True) for tree_id, raw_tree in enumerate(dump): + if max_trees > 0 and tree_id == max_trees: + break raw_tree_parsed = json.loads(raw_tree) root_node = Node.from_xgb_dict(raw_tree_parsed) tl.append(TreeView(tree_id=tree_id, root_node=root_node)) @@ -194,7 +196,7 @@ class TreeList(list): @staticmethod def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList": """ - Load a TreeList from a lgbm.Model object + Load a TreeList from a lgbm Booster dump Note: We cannot type-hint the the Model without loading lightgbm as dependency in pyx code, therefore not type hint is added. 
""" @@ -382,9 +384,17 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: else: is_regression = True - n_iterations = booster.best_iteration + 1 + max_trees = ( + getattr(booster, "best_iteration", -1) + 1 + ) # 0 if best_iteration does not exist + if n_classes > 2: + max_trees *= n_classes + tree_list = TreeList.from_xgb_booster(booster, max_trees) - tree_list = TreeList.from_xgb_booster(booster) + if hasattr(booster, "best_iteration"): + n_iterations = booster.best_iteration + 1 + else: + n_iterations = len(tree_list) // (n_classes if n_classes > 2 else 1) return get_gbt_model_from_tree_list( tree_list, @@ -708,7 +718,5 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: cover=0.0, ) - logging.warning( - "Models converted from CatBoost cannot be used for SHAP value calculation" - ) + warn("Models converted from CatBoost cannot be used for SHAP value calculation") return mb.model() From 89fde707aa70d3ac644c02f2a286c498eb1ba26f Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 4 Oct 2023 08:15:43 -0700 Subject: [PATCH 23/64] refactor: simplify early stopping test case --- tests/test_model_builders.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index c84dc28a73..73a1a6f804 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -495,20 +495,22 @@ def setUpClass(cls) -> None: "learning_rate": 0.3, "num_class": num_classes, "early_stopping_rounds": 5, + "verbose_eval": False, } cls.xgb_clf = xgb.XGBClassifier(**params) - cls.xgb_clf.fit(X_train, y_train, eval_set=[(cls.X_test, cls.y_test)]) + cls.xgb_clf.fit( + X_train, y_train, eval_set=[(cls.X_test, cls.y_test)], verbose=False + ) + cls.daal_model = d4p.mb.convert_model(cls.xgb_clf.get_booster()) def test_early_stopping(self): xgb_prediction = self.xgb_clf.predict(self.X_test) xgb_proba = self.xgb_clf.predict_proba(self.X_test) xgb_errors_count = np.count_nonzero(xgb_prediction - np.ravel(self.y_test)) - booster = self.xgb_clf.get_booster() - daal_model = d4p.mb.convert_model(booster) - daal_prediction = daal_model.predict(self.X_test) - daal_proba = daal_model.predict_proba(self.X_test) + daal_prediction = self.daal_model.predict(self.X_test) + daal_proba = self.daal_model.predict_proba(self.X_test) daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(self.y_test)) self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) From fb7b209f82fb22a15126fdc7a75794bd702b6cde Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 01:09:37 -0700 Subject: [PATCH 24/64] fix: add SHAP to requirements-test --- requirements-test.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-test.txt b/requirements-test.txt index b9bd3d05c3..a34e1b2cd0 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -9,3 +9,5 @@ pandas==2.1.1 ; python_version >= '3.9' xgboost==1.7.6; python_version <= '3.9' xgboost==2.0.0; python_version >= '3.10' lightgbm==4.1.0 +catboost==1.2.1 +shap==0.42.1 From 7a2e892c69587bc8b47c1cd26b6017fcee6e796a Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 01:28:46 -0700 Subject: [PATCH 25/64] chore: update oneDAL version for _gbt_inference_api_versision 2 --- src/gbt_model_builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index b26636387b..2de4b6a0da 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ 
-22,7 +22,7 @@ #include #include "onedal/version.hpp" -#if (((MAJOR_VERSION == 2024) && (MINOR_VERSION >= 1)) || (MAJOR_VERSION > 2024)) +#if (((MAJOR_VERSION == 2024) && (MINOR_VERSION == 0) && (UPDATE_VERSION >= 1)) || ((MAJOR_VERSION > 2024) && (MINOR_VERSION >= 1))) #define _gbt_inference_api_version 2 #elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) #define _gbt_inference_api_version 1 From 697622ef11b15fc64e8c3727cdf13ff1da6f8c9e Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 03:08:40 -0700 Subject: [PATCH 26/64] Add GBT model builder API version descriptions --- src/gbt_convertors.py.bak | 722 ++++++++++++++++++++++++++++++++++++++ src/gbt_model_builder.h | 3 + 2 files changed, 725 insertions(+) create mode 100755 src/gbt_convertors.py.bak diff --git a/src/gbt_convertors.py.bak b/src/gbt_convertors.py.bak new file mode 100755 index 0000000000..f0c5edf2e7 --- /dev/null +++ b/src/gbt_convertors.py.bak @@ -0,0 +1,722 @@ +# =============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +import json +from collections import deque +from tempfile import NamedTemporaryFile +from typing import Any, Deque, Dict, List, Optional, Tuple +from warnings import warn + +import numpy as np + + +class CatBoostNode: + def __init__( + self, + split: Optional[float] = None, + value: Optional[List[float]] = None, + right: Optional[int] = None, + left: Optional[float] = None, + cover: Optional[float] = None, + ) -> None: + self.split = split + self.value = value + self.right = right + self.left = left + self.cover = cover + + +class Node: + """Helper class holding Tree Node information""" + + def __init__( + self, + cover: float, + is_leaf: bool, + default_left: bool, + feature: int, + value: float, + n_children: int = 0, + left_child: "Optional[Node]" = None, + right_child: "Optional[Node]" = None, + parent_id: Optional[int] = -1, + position: Optional[int] = -1, + ) -> None: + self.cover = cover + self.is_leaf = is_leaf + self.default_left = default_left + self.__feature = feature + self.value = value + self.n_children = n_children + self.left_child = left_child + self.right_child = right_child + self.parent_id = parent_id + self.position = position + + @staticmethod + def from_xgb_dict(input_dict: Dict[str, Any]) -> "Node": + if "children" in input_dict: + left_child = Node.from_xgb_dict(input_dict["children"][0]) + right_child = Node.from_xgb_dict(input_dict["children"][1]) + n_children = 2 + left_child.n_children + right_child.n_children + else: + left_child = None + right_child = None + n_children = 0 + is_leaf = "leaf" in input_dict + default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] + return Node( + cover=input_dict["cover"], + is_leaf=is_leaf, + default_left=default_left, + feature=input_dict.get("split"), + value=input_dict["leaf"] 
if is_leaf else input_dict["split_condition"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + + @staticmethod + def from_lightgbm_dict(input_dict: Dict[str, Any]) -> "Node": + if "tree_structure" in input_dict: + tree = input_dict["tree_structure"] + else: + tree = input_dict + + n_children = 0 + if "left_child" in tree: + left_child = Node.from_lightgbm_dict(tree["left_child"]) + n_children += 1 + left_child.n_children + else: + left_child = None + if "right_child" in tree: + right_child = Node.from_lightgbm_dict(tree["right_child"]) + n_children += 1 + right_child.n_children + else: + right_child = None + + is_leaf = "leaf_value" in tree + return Node( + cover=tree["leaf_count"] if is_leaf else tree["internal_count"], + is_leaf=is_leaf, + default_left=is_leaf or tree["default_left"], + feature=tree.get("split_feature"), + value=tree["leaf_value"] if is_leaf else tree["threshold"], + n_children=n_children, + left_child=left_child, + right_child=right_child, + ) + + def get_value_closest_float_downward(self) -> np.float64: + """Get the closest exact fp value smaller than self.value""" + return np.nextafter(np.single(self.value), np.single(-np.inf)) + + def get_children(self) -> "Optional[Tuple[Node, Node]]": + if not self.left_child or not self.right_child: + assert self.is_leaf + else: + return (self.left_child, self.right_child) + + @property + def feature(self) -> int: + if isinstance(self.__feature, int): + return self.__feature + if isinstance(self.__feature, str) and self.__feature.isnumeric(): + return int(self.__feature) + raise ValueError( + f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})" + ) + + +class TreeView: + """Helper class, treating a list of nodes as one tree""" + + def __init__(self, tree_id: int, root_node: Node) -> None: + self.tree_id = tree_id + self.root_node = root_node + + @property + def is_leaf(self) -> bool: + return self.root_node.is_leaf + + @property + def value(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + if self.root_node.value is None: + raise ValueError("Tree is leaf-only but leaf node has no value") + return self.root_node.value + + @property + def cover(self) -> float: + if not self.is_leaf: + raise ValueError("Tree is not a leaf-only tree") + return self.root_node.cover + + @property + def n_nodes(self) -> int: + return self.root_node.n_children + 1 + + +class TreeList(list): + """Helper class that is able to extract all information required by the + model builders from various objects""" + + @staticmethod + def from_xgb_booster(booster, max_trees: int) -> "TreeList": + """ + Load a TreeList from an xgb.Booster object + Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code, + therefore not type hint is added. + """ + tl = TreeList() + dump = booster.get_dump(dump_format="json", with_stats=True) + for tree_id, raw_tree in enumerate(dump): + if max_trees > 0 and tree_id == max_trees: + break + raw_tree_parsed = json.loads(raw_tree) + root_node = Node.from_xgb_dict(raw_tree_parsed) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) + + return tl + + @staticmethod + def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList": + """ + Load a TreeList from a lgbm Booster dump + Note: We cannot type-hint the the Model without loading lightgbm as dependency in pyx code, + therefore not type hint is added. 
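+
+        A minimal usage sketch, illustrative only (assumes lightgbm is
+        installed and X, y are array-likes):
+
+            booster = lgbm.train(params, train_set=lgbm.Dataset(X, y))
+            dump = booster.dump_model()
+            tree_list = TreeList.from_lightgbm_booster_dump(dump)
+            assert len(tree_list) == len(dump["tree_info"])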
+ """ + tl = TreeList() + for tree_id, tree_dict in enumerate(dump["tree_info"]): + root_node = Node.from_lightgbm_dict(tree_dict) + tl.append(TreeView(tree_id=tree_id, root_node=root_node)) + + return tl + + def __setitem__(self): + raise NotImplementedError( + "Use TreeList.from_*() methods to initialize a TreeList" + ) + + +def get_lightgbm_params(booster): + return booster.dump_model() + + +def get_xgboost_params(booster): + return json.loads(booster.save_config()) + + +def get_catboost_params(booster): + with NamedTemporaryFile() as fp: + booster.save_model(fp.name, "json") + fp.seek(0) + model_data = json.load(fp) + return model_data + + +def get_gbt_model_from_tree_list( + tree_list: TreeList, + n_iterations: int, + is_regression: bool, + n_features: int, + n_classes: int, + base_score: float, + add_base_score_as_tree: bool, +): + """Return a GBT Model from TreeList""" + + if is_regression: + if add_base_score_as_tree: + mb = gbt_reg_model_builder( + n_features=n_features, n_iterations=n_iterations + 1 + ) + tree_id = mb.create_tree(1) + mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) + else: + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + else: + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + + class_label = 0 + for counter, tree in enumerate(tree_list, start=1): + # find out the number of nodes in the tree + if is_regression: + tree_id = mb.create_tree(tree.n_nodes) + else: + tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) + + if counter % n_iterations == 0: + class_label += 1 + + if tree.is_leaf: + mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) + continue + + root_node = tree.root_node + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=root_node.feature, + feature_value=root_node.get_value_closest_float_downward(), + cover=root_node.cover, + default_left=root_node.default_left, + ) + + # create queue + node_queue: Deque[Node] = deque() + children = root_node.get_children() + assert children is not None + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + while node_queue: + node = node_queue.popleft() + assert node.parent_id != -1, "node.parent_id must not be -1" + assert node.position != -1, "node.position must not be -1" + + if node.is_leaf: + mb.add_leaf( + tree_id=tree_id, + response=node.value, + cover=node.cover, + parent_id=node.parent_id, + position=node.position, + ) + else: + parent_id = mb.add_split( + tree_id=tree_id, + feature_index=node.feature, + feature_value=node.get_value_closest_float_downward(), + cover=node.cover, + default_left=node.default_left, + parent_id=node.parent_id, + position=node.position, + ) + + children = node.get_children() + assert children is not None + for position, child in enumerate(children): + child.parent_id = parent_id + child.position = position + node_queue.append(child) + + return mb.model() + + +def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: + if booster is None: + booster = model.dump_model() + + n_features = booster["max_feature_idx"] + 1 + n_iterations = len(booster["tree_info"]) / booster["num_tree_per_iteration"] + n_classes = booster["num_tree_per_iteration"] + + is_regression = False + objective_fun = booster["objective"] + if n_classes > 2: + if "multiclass" not in objective_fun: + raise TypeError( + "multiclass (softmax) objective is only supported for multiclass 
classification" + ) + elif "binary" in objective_fun: # nClasses == 1 + n_classes = 2 + else: + is_regression = True + + tree_list = TreeList.from_lightgbm_booster_dump(booster) + + return get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + base_score=0, + add_base_score_as_tree=False, + ) + + +def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: + # Release Note for XGBoost 1.5.0: Python interface now supports configuring + # constraints using feature names instead of feature indices. This also + # helps with pandas input with set feature names. + booster.feature_names = [str(i) for i in range(booster.num_features())] + + if xgb_config is None: + xgb_config = get_xgboost_params(booster) + + n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) + n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) + base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) + + is_regression = False + objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] + if n_classes > 2: + if objective_fun not in ["multi:softprob", "multi:softmax"]: + raise TypeError( + "multi:softprob and multi:softmax are only supported for multiclass classification" + ) + elif objective_fun.find("binary:") == 0: + if objective_fun in ["binary:logistic", "binary:logitraw"]: + n_classes = 2 + else: + raise TypeError( + "binary:logistic and binary:logitraw are only supported for binary classification" + ) + else: + is_regression = True + + max_trees = ( + getattr(booster, "best_iteration", -1) + 1 + ) # 0 if best_iteration does not exist + if n_classes > 2: + max_trees *= n_classes + tree_list = TreeList.from_xgb_booster(booster, max_trees) + + if hasattr(booster, "best_iteration"): + n_iterations = booster.best_iteration + 1 + else: + n_iterations = len(tree_list) // (n_classes if n_classes > 2 else 1) + + return get_gbt_model_from_tree_list( + tree_list, + n_iterations=n_iterations, + is_regression=is_regression, + n_features=n_features, + n_classes=n_classes, + base_score=base_score, + add_base_score_as_tree=True, + ) + + +def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: + if not model.is_fitted(): + raise RuntimeError("Model should be fitted before exporting to daal4py.") + + if model_data is None: + model_data = get_catboost_params(model) + + if "categorical_features" in model_data["features_info"]: + raise NotImplementedError( + "Categorical features are not supported in daal4py Gradient Boosting Trees" + ) + + n_features = len(model_data["features_info"]["float_features"]) + + is_symmetric_tree = ( + model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] + == "SymmetricTree" + ) + + if is_symmetric_tree: + n_iterations = len(model_data["oblivious_trees"]) + else: + n_iterations = len(model_data["trees"]) + + n_classes = 0 + + if "class_params" in model_data["model_info"]: + is_classification = True + n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) + mb = gbt_clf_model_builder( + n_features=n_features, n_iterations=n_iterations, n_classes=n_classes + ) + else: + is_classification = False + mb = gbt_reg_model_builder(n_features, n_iterations) + + splits = [] + + # Create splits array (all splits are placed sequentially) + for feature in model_data["features_info"]["float_features"]: + if feature["borders"]: + for feature_border in feature["borders"]: + 
splits.append( + {"feature_index": feature["feature_index"], "value": feature_border} + ) + + if not is_classification: + bias = model_data["scale_and_bias"][1][0] / n_iterations + scale = model_data["scale_and_bias"][0] + else: + bias = 0 + scale = 1 + + trees_explicit = [] + tree_symmetric = [] + + if ( + model_data["model_info"]["params"]["data_processing_options"][ + "float_features_binarization" + ]["nan_mode"] + == "Min" + ): + default_left = 1 + else: + default_left = 0 + + for tree_num in range(n_iterations): + if is_symmetric_tree: + if model_data["oblivious_trees"][tree_num]["splits"] is not None: + # Tree has more than 1 node + cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) + else: + cur_tree_depth = 0 + + tree_symmetric.append( + (model_data["oblivious_trees"][tree_num], cur_tree_depth) + ) + else: + n_nodes = 1 + # Check if node is a leaf (in case of stump) + if "split" in model_data["trees"][tree_num]: + # Get number of trees and splits info via BFS + # Create queue + nodes_queue = [] + root_node = CatBoostNode( + split=splits[model_data["trees"][tree_num]["split"]["split_index"]] + ) + nodes_queue.append((model_data["trees"][tree_num], root_node)) + while nodes_queue: + cur_node_data, cur_node = nodes_queue.pop(0) + if "value" in cur_node_data: + if isinstance(cur_node_data["value"], list): + cur_node.value = [value for value in cur_node_data["value"]] + else: + cur_node.value = [cur_node_data["value"] * scale + bias] + else: + cur_node.split = splits[cur_node_data["split"]["split_index"]] + left_node = CatBoostNode() + right_node = CatBoostNode() + cur_node.left = left_node + cur_node.right = right_node + nodes_queue.append((cur_node_data["left"], left_node)) + nodes_queue.append((cur_node_data["right"], right_node)) + n_nodes += 2 + else: + root_node = CatBoostNode() + if is_classification and n_classes > 2: + root_node.value = [ + value * scale for value in model_data["trees"][tree_num]["value"] + ] + else: + root_node.value = [ + model_data["trees"][tree_num]["value"] * scale + bias + ] + trees_explicit.append((root_node, n_nodes)) + + tree_id = [] + class_label = 0 + count = 0 + + # Only 1 tree for each iteration in case of regression or binary classification + if not is_classification or n_classes == 2: + n_tree_each_iter = 1 + else: + n_tree_each_iter = n_classes + + # Create id for trees (for the right order in modelbuilder) + for i in range(n_iterations): + for c in range(n_tree_each_iter): + if is_symmetric_tree: + n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 + else: + n_nodes = trees_explicit[i][1] + + if is_classification and n_classes > 2: + tree_id.append(mb.create_tree(n_nodes, class_label)) + count += 1 + if count == n_iterations: + class_label += 1 + count = 0 + + elif is_classification: + tree_id.append(mb.create_tree(n_nodes, 0)) + else: + tree_id.append(mb.create_tree(n_nodes)) + + if is_symmetric_tree: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + cur_tree_info = tree_symmetric[i][0] + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + cur_tree_leaf_val = cur_tree_info["leaf_values"] + cur_tree_depth = tree_symmetric[i][1] + + if cur_tree_depth == 0: + mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) + else: + # One split used for the whole level + cur_level_split = splits[ + cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] + ] + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=cur_level_split["feature_index"], + 
feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + prev_level_nodes = [root_id] + + # Iterate over levels, splits in json are reversed (root split is the last) + for cur_level in range(cur_tree_depth - 2, -1, -1): + cur_level_nodes = [] + for cur_parent in prev_level_nodes: + cur_level_split = splits[ + cur_tree_info["splits"][cur_level]["split_index"] + ] + cur_left_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=0, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + cur_right_node = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_parent, + position=1, + feature_index=cur_level_split["feature_index"], + feature_value=cur_level_split["value"], + default_left=default_left, + cover=0.0, + ) + cur_level_nodes.append(cur_left_node) + cur_level_nodes.append(cur_right_node) + prev_level_nodes = cur_level_nodes + + # Different storing format for leaves + if not is_classification or n_classes == 2: + for last_level_node_num in range(len(prev_level_nodes)): + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + cover=0.0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[2 * last_level_node_num + 1] + * scale + + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + cover=0.0, + ) + else: + for last_level_node_num in range(len(prev_level_nodes)): + left_index = ( + 2 * last_level_node_num * n_tree_each_iter + class_label + ) + right_index = ( + 2 * last_level_node_num + 1 + ) * n_tree_each_iter + class_label + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[left_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=0, + cover=0.0, + ) + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_tree_leaf_val[right_index] * scale + bias, + parent_id=prev_level_nodes[last_level_node_num], + position=1, + cover=0.0, + ) + else: + for class_label in range(n_tree_each_iter): + for i in range(n_iterations): + root_node = trees_explicit[i][0] + + cur_tree_id = tree_id[i * n_tree_each_iter + class_label] + # Traverse tree via BFS and build tree with modelbuilder + if root_node.value is None: + root_id = mb.add_split( + tree_id=cur_tree_id, + feature_index=root_node.split["feature_index"], + feature_value=root_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue = [(root_node, root_id)] + while nodes_queue: + cur_node, cur_node_id = nodes_queue.pop(0) + left_node = cur_node.left + # Check if node is a leaf + if left_node.value is None: + left_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=0, + feature_index=left_node.split["feature_index"], + feature_value=left_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue.append((left_node, left_node_id)) + else: + mb.add_leaf( + tree_id=cur_tree_id, + response=left_node.value[class_label], + parent_id=cur_node_id, + position=0, + cover=0.0, + ) + right_node = cur_node.right + # Check if node is a leaf + if right_node.value is None: + right_node_id = mb.add_split( + tree_id=cur_tree_id, + parent_id=cur_node_id, + position=1, + feature_index=right_node.split["feature_index"], + feature_value=right_node.split["value"], + default_left=default_left, + cover=0.0, + ) + nodes_queue.append((right_node, right_node_id)) + 
else: + mb.add_leaf( + tree_id=cur_tree_id, + response=cur_node.right.value[class_label], + parent_id=cur_node_id, + position=1, + cover=0.0, + ) + + else: + # Tree has only one node + mb.add_leaf( + tree_id=cur_tree_id, + response=root_node.value[class_label], + cover=0.0, + ) + + warn("Models converted from CatBoost cannot be used for SHAP value calculation") + return mb.model() diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 2de4b6a0da..7ebcc32838 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -23,8 +23,11 @@ #include "onedal/version.hpp" #if (((MAJOR_VERSION == 2024) && (MINOR_VERSION == 0) && (UPDATE_VERSION >= 1)) || ((MAJOR_VERSION > 2024) && (MINOR_VERSION >= 1))) + // added missing value support to GBT regression + // added SHAP value support #define _gbt_inference_api_version 2 #elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) + // added missing value support to GBT classification #define _gbt_inference_api_version 1 #else #define _gbt_inference_api_version 0 From 89b37b32d44c6aecd748795c69dfcfe9a8d75658 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 03:08:52 -0700 Subject: [PATCH 27/64] Fix typo in pred_interactions test --- tests/test_model_builders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 73a1a6f804..ecb464d58f 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -358,7 +358,7 @@ def test_model_predict_shap_contribs(self): def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): - m.predict(self.X_test, pred_interactions == True) + m.predict(self.X_test, pred_interactions=True) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) @@ -529,7 +529,7 @@ def get_dump(self, *_, **kwargs): ] mock = MockBooster() - result = d4p.TreeList.from_booster(mock) + result = d4p.TreeList.from_xgb_booster(mock) self.assertEqual(len(result), 2) tree0 = result[0] From e4ab316850ab8dd38af97580e5880b66bbad8257 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 03:25:37 -0700 Subject: [PATCH 28/64] fix: remove local backup file --- src/gbt_convertors.py.bak | 722 -------------------------------------- 1 file changed, 722 deletions(-) delete mode 100755 src/gbt_convertors.py.bak diff --git a/src/gbt_convertors.py.bak b/src/gbt_convertors.py.bak deleted file mode 100755 index f0c5edf2e7..0000000000 --- a/src/gbt_convertors.py.bak +++ /dev/null @@ -1,722 +0,0 @@ -# =============================================================================== -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
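
For reference, the early-stopping cap added in PATCH 22 (and carried verbatim
in this backup file) reduces to a few lines of arithmetic; a standalone sketch
with a stand-in object (StoppedBooster is hypothetical, not a real
xgb.Booster):

    # best_iteration is 0-based, so best_iteration + 1 rounds survive early
    # stopping; getattr(..., -1) + 1 evaluates to 0 ("no cap") when absent.
    class StoppedBooster:
        best_iteration = 41        # training stopped after round 42

    n_classes = 3                  # multiclass: one tree per class per round
    max_trees = getattr(StoppedBooster, "best_iteration", -1) + 1
    if n_classes > 2:
        max_trees *= n_classes
    assert max_trees == 42 * 3     # TreeList.from_xgb_booster keeps 126 trees

    # without early stopping there is no best_iteration attribute: no cap
    assert getattr(object(), "best_iteration", -1) + 1 == 0
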
-# =============================================================================== - -import json -from collections import deque -from tempfile import NamedTemporaryFile -from typing import Any, Deque, Dict, List, Optional, Tuple -from warnings import warn - -import numpy as np - - -class CatBoostNode: - def __init__( - self, - split: Optional[float] = None, - value: Optional[List[float]] = None, - right: Optional[int] = None, - left: Optional[float] = None, - cover: Optional[float] = None, - ) -> None: - self.split = split - self.value = value - self.right = right - self.left = left - self.cover = cover - - -class Node: - """Helper class holding Tree Node information""" - - def __init__( - self, - cover: float, - is_leaf: bool, - default_left: bool, - feature: int, - value: float, - n_children: int = 0, - left_child: "Optional[Node]" = None, - right_child: "Optional[Node]" = None, - parent_id: Optional[int] = -1, - position: Optional[int] = -1, - ) -> None: - self.cover = cover - self.is_leaf = is_leaf - self.default_left = default_left - self.__feature = feature - self.value = value - self.n_children = n_children - self.left_child = left_child - self.right_child = right_child - self.parent_id = parent_id - self.position = position - - @staticmethod - def from_xgb_dict(input_dict: Dict[str, Any]) -> "Node": - if "children" in input_dict: - left_child = Node.from_xgb_dict(input_dict["children"][0]) - right_child = Node.from_xgb_dict(input_dict["children"][1]) - n_children = 2 + left_child.n_children + right_child.n_children - else: - left_child = None - right_child = None - n_children = 0 - is_leaf = "leaf" in input_dict - default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] - return Node( - cover=input_dict["cover"], - is_leaf=is_leaf, - default_left=default_left, - feature=input_dict.get("split"), - value=input_dict["leaf"] if is_leaf else input_dict["split_condition"], - n_children=n_children, - left_child=left_child, - right_child=right_child, - ) - - @staticmethod - def from_lightgbm_dict(input_dict: Dict[str, Any]) -> "Node": - if "tree_structure" in input_dict: - tree = input_dict["tree_structure"] - else: - tree = input_dict - - n_children = 0 - if "left_child" in tree: - left_child = Node.from_lightgbm_dict(tree["left_child"]) - n_children += 1 + left_child.n_children - else: - left_child = None - if "right_child" in tree: - right_child = Node.from_lightgbm_dict(tree["right_child"]) - n_children += 1 + right_child.n_children - else: - right_child = None - - is_leaf = "leaf_value" in tree - return Node( - cover=tree["leaf_count"] if is_leaf else tree["internal_count"], - is_leaf=is_leaf, - default_left=is_leaf or tree["default_left"], - feature=tree.get("split_feature"), - value=tree["leaf_value"] if is_leaf else tree["threshold"], - n_children=n_children, - left_child=left_child, - right_child=right_child, - ) - - def get_value_closest_float_downward(self) -> np.float64: - """Get the closest exact fp value smaller than self.value""" - return np.nextafter(np.single(self.value), np.single(-np.inf)) - - def get_children(self) -> "Optional[Tuple[Node, Node]]": - if not self.left_child or not self.right_child: - assert self.is_leaf - else: - return (self.left_child, self.right_child) - - @property - def feature(self) -> int: - if isinstance(self.__feature, int): - return self.__feature - if isinstance(self.__feature, str) and self.__feature.isnumeric(): - return int(self.__feature) - raise ValueError( - f"Feature names must be integers (got 
({type(self.__feature)}){self.__feature})" - ) - - -class TreeView: - """Helper class, treating a list of nodes as one tree""" - - def __init__(self, tree_id: int, root_node: Node) -> None: - self.tree_id = tree_id - self.root_node = root_node - - @property - def is_leaf(self) -> bool: - return self.root_node.is_leaf - - @property - def value(self) -> float: - if not self.is_leaf: - raise ValueError("Tree is not a leaf-only tree") - if self.root_node.value is None: - raise ValueError("Tree is leaf-only but leaf node has no value") - return self.root_node.value - - @property - def cover(self) -> float: - if not self.is_leaf: - raise ValueError("Tree is not a leaf-only tree") - return self.root_node.cover - - @property - def n_nodes(self) -> int: - return self.root_node.n_children + 1 - - -class TreeList(list): - """Helper class that is able to extract all information required by the - model builders from various objects""" - - @staticmethod - def from_xgb_booster(booster, max_trees: int) -> "TreeList": - """ - Load a TreeList from an xgb.Booster object - Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code, - therefore not type hint is added. - """ - tl = TreeList() - dump = booster.get_dump(dump_format="json", with_stats=True) - for tree_id, raw_tree in enumerate(dump): - if max_trees > 0 and tree_id == max_trees: - break - raw_tree_parsed = json.loads(raw_tree) - root_node = Node.from_xgb_dict(raw_tree_parsed) - tl.append(TreeView(tree_id=tree_id, root_node=root_node)) - - return tl - - @staticmethod - def from_lightgbm_booster_dump(dump: Dict[str, Any]) -> "TreeList": - """ - Load a TreeList from a lgbm Booster dump - Note: We cannot type-hint the the Model without loading lightgbm as dependency in pyx code, - therefore not type hint is added. 
- """ - tl = TreeList() - for tree_id, tree_dict in enumerate(dump["tree_info"]): - root_node = Node.from_lightgbm_dict(tree_dict) - tl.append(TreeView(tree_id=tree_id, root_node=root_node)) - - return tl - - def __setitem__(self): - raise NotImplementedError( - "Use TreeList.from_*() methods to initialize a TreeList" - ) - - -def get_lightgbm_params(booster): - return booster.dump_model() - - -def get_xgboost_params(booster): - return json.loads(booster.save_config()) - - -def get_catboost_params(booster): - with NamedTemporaryFile() as fp: - booster.save_model(fp.name, "json") - fp.seek(0) - model_data = json.load(fp) - return model_data - - -def get_gbt_model_from_tree_list( - tree_list: TreeList, - n_iterations: int, - is_regression: bool, - n_features: int, - n_classes: int, - base_score: float, - add_base_score_as_tree: bool, -): - """Return a GBT Model from TreeList""" - - if is_regression: - if add_base_score_as_tree: - mb = gbt_reg_model_builder( - n_features=n_features, n_iterations=n_iterations + 1 - ) - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) - else: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - - class_label = 0 - for counter, tree in enumerate(tree_list, start=1): - # find out the number of nodes in the tree - if is_regression: - tree_id = mb.create_tree(tree.n_nodes) - else: - tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) - - if counter % n_iterations == 0: - class_label += 1 - - if tree.is_leaf: - mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) - continue - - root_node = tree.root_node - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=root_node.feature, - feature_value=root_node.get_value_closest_float_downward(), - cover=root_node.cover, - default_left=root_node.default_left, - ) - - # create queue - node_queue: Deque[Node] = deque() - children = root_node.get_children() - assert children is not None - for position, child in enumerate(children): - child.parent_id = parent_id - child.position = position - node_queue.append(child) - - while node_queue: - node = node_queue.popleft() - assert node.parent_id != -1, "node.parent_id must not be -1" - assert node.position != -1, "node.position must not be -1" - - if node.is_leaf: - mb.add_leaf( - tree_id=tree_id, - response=node.value, - cover=node.cover, - parent_id=node.parent_id, - position=node.position, - ) - else: - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=node.feature, - feature_value=node.get_value_closest_float_downward(), - cover=node.cover, - default_left=node.default_left, - parent_id=node.parent_id, - position=node.position, - ) - - children = node.get_children() - assert children is not None - for position, child in enumerate(children): - child.parent_id = parent_id - child.position = position - node_queue.append(child) - - return mb.model() - - -def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: - if booster is None: - booster = model.dump_model() - - n_features = booster["max_feature_idx"] + 1 - n_iterations = len(booster["tree_info"]) / booster["num_tree_per_iteration"] - n_classes = booster["num_tree_per_iteration"] - - is_regression = False - objective_fun = booster["objective"] - if n_classes > 2: - if "multiclass" not in objective_fun: - raise TypeError( - "multiclass (softmax) objective is only supported for multiclass 
classification" - ) - elif "binary" in objective_fun: # nClasses == 1 - n_classes = 2 - else: - is_regression = True - - tree_list = TreeList.from_lightgbm_booster_dump(booster) - - return get_gbt_model_from_tree_list( - tree_list, - n_iterations=n_iterations, - is_regression=is_regression, - n_features=n_features, - n_classes=n_classes, - base_score=0, - add_base_score_as_tree=False, - ) - - -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. - booster.feature_names = [str(i) for i in range(booster.num_features())] - - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification" - ) - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification" - ) - else: - is_regression = True - - max_trees = ( - getattr(booster, "best_iteration", -1) + 1 - ) # 0 if best_iteration does not exist - if n_classes > 2: - max_trees *= n_classes - tree_list = TreeList.from_xgb_booster(booster, max_trees) - - if hasattr(booster, "best_iteration"): - n_iterations = booster.best_iteration + 1 - else: - n_iterations = len(tree_list) // (n_classes if n_classes > 2 else 1) - - return get_gbt_model_from_tree_list( - tree_list, - n_iterations=n_iterations, - is_regression=is_regression, - n_features=n_features, - n_classes=n_classes, - base_score=base_score, - add_base_score_as_tree=True, - ) - - -def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: - if not model.is_fitted(): - raise RuntimeError("Model should be fitted before exporting to daal4py.") - - if model_data is None: - model_data = get_catboost_params(model) - - if "categorical_features" in model_data["features_info"]: - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - - n_features = len(model_data["features_info"]["float_features"]) - - is_symmetric_tree = ( - model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] - == "SymmetricTree" - ) - - if is_symmetric_tree: - n_iterations = len(model_data["oblivious_trees"]) - else: - n_iterations = len(model_data["trees"]) - - n_classes = 0 - - if "class_params" in model_data["model_info"]: - is_classification = True - n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - else: - is_classification = False - mb = gbt_reg_model_builder(n_features, n_iterations) - - splits = [] - - # Create splits array (all splits are placed sequentially) - for feature in model_data["features_info"]["float_features"]: - if feature["borders"]: - for feature_border in feature["borders"]: - 
splits.append( - {"feature_index": feature["feature_index"], "value": feature_border} - ) - - if not is_classification: - bias = model_data["scale_and_bias"][1][0] / n_iterations - scale = model_data["scale_and_bias"][0] - else: - bias = 0 - scale = 1 - - trees_explicit = [] - tree_symmetric = [] - - if ( - model_data["model_info"]["params"]["data_processing_options"][ - "float_features_binarization" - ]["nan_mode"] - == "Min" - ): - default_left = 1 - else: - default_left = 0 - - for tree_num in range(n_iterations): - if is_symmetric_tree: - if model_data["oblivious_trees"][tree_num]["splits"] is not None: - # Tree has more than 1 node - cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) - else: - cur_tree_depth = 0 - - tree_symmetric.append( - (model_data["oblivious_trees"][tree_num], cur_tree_depth) - ) - else: - n_nodes = 1 - # Check if node is a leaf (in case of stump) - if "split" in model_data["trees"][tree_num]: - # Get number of trees and splits info via BFS - # Create queue - nodes_queue = [] - root_node = CatBoostNode( - split=splits[model_data["trees"][tree_num]["split"]["split_index"]] - ) - nodes_queue.append((model_data["trees"][tree_num], root_node)) - while nodes_queue: - cur_node_data, cur_node = nodes_queue.pop(0) - if "value" in cur_node_data: - if isinstance(cur_node_data["value"], list): - cur_node.value = [value for value in cur_node_data["value"]] - else: - cur_node.value = [cur_node_data["value"] * scale + bias] - else: - cur_node.split = splits[cur_node_data["split"]["split_index"]] - left_node = CatBoostNode() - right_node = CatBoostNode() - cur_node.left = left_node - cur_node.right = right_node - nodes_queue.append((cur_node_data["left"], left_node)) - nodes_queue.append((cur_node_data["right"], right_node)) - n_nodes += 2 - else: - root_node = CatBoostNode() - if is_classification and n_classes > 2: - root_node.value = [ - value * scale for value in model_data["trees"][tree_num]["value"] - ] - else: - root_node.value = [ - model_data["trees"][tree_num]["value"] * scale + bias - ] - trees_explicit.append((root_node, n_nodes)) - - tree_id = [] - class_label = 0 - count = 0 - - # Only 1 tree for each iteration in case of regression or binary classification - if not is_classification or n_classes == 2: - n_tree_each_iter = 1 - else: - n_tree_each_iter = n_classes - - # Create id for trees (for the right order in modelbuilder) - for i in range(n_iterations): - for c in range(n_tree_each_iter): - if is_symmetric_tree: - n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 - else: - n_nodes = trees_explicit[i][1] - - if is_classification and n_classes > 2: - tree_id.append(mb.create_tree(n_nodes, class_label)) - count += 1 - if count == n_iterations: - class_label += 1 - count = 0 - - elif is_classification: - tree_id.append(mb.create_tree(n_nodes, 0)) - else: - tree_id.append(mb.create_tree(n_nodes)) - - if is_symmetric_tree: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - cur_tree_info = tree_symmetric[i][0] - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info["leaf_values"] - cur_tree_depth = tree_symmetric[i][1] - - if cur_tree_depth == 0: - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) - else: - # One split used for the whole level - cur_level_split = splits[ - cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] - ] - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=cur_level_split["feature_index"], - 
feature_value=cur_level_split["value"], - default_left=default_left, - cover=0.0, - ) - prev_level_nodes = [root_id] - - # Iterate over levels, splits in json are reversed (root split is the last) - for cur_level in range(cur_tree_depth - 2, -1, -1): - cur_level_nodes = [] - for cur_parent in prev_level_nodes: - cur_level_split = splits[ - cur_tree_info["splits"][cur_level]["split_index"] - ] - cur_left_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=0, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - cover=0.0, - ) - cur_right_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=1, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - cover=0.0, - ) - cur_level_nodes.append(cur_left_node) - cur_level_nodes.append(cur_right_node) - prev_level_nodes = cur_level_nodes - - # Different storing format for leaves - if not is_classification or n_classes == 2: - for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * last_level_node_num] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - cover=0.0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * last_level_node_num + 1] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - cover=0.0, - ) - else: - for last_level_node_num in range(len(prev_level_nodes)): - left_index = ( - 2 * last_level_node_num * n_tree_each_iter + class_label - ) - right_index = ( - 2 * last_level_node_num + 1 - ) * n_tree_each_iter + class_label - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[left_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - cover=0.0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[right_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - cover=0.0, - ) - else: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - root_node = trees_explicit[i][0] - - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - # Traverse tree via BFS and build tree with modelbuilder - if root_node.value is None: - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=root_node.split["feature_index"], - feature_value=root_node.split["value"], - default_left=default_left, - cover=0.0, - ) - nodes_queue = [(root_node, root_id)] - while nodes_queue: - cur_node, cur_node_id = nodes_queue.pop(0) - left_node = cur_node.left - # Check if node is a leaf - if left_node.value is None: - left_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=0, - feature_index=left_node.split["feature_index"], - feature_value=left_node.split["value"], - default_left=default_left, - cover=0.0, - ) - nodes_queue.append((left_node, left_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, - response=left_node.value[class_label], - parent_id=cur_node_id, - position=0, - cover=0.0, - ) - right_node = cur_node.right - # Check if node is a leaf - if right_node.value is None: - right_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=1, - feature_index=right_node.split["feature_index"], - feature_value=right_node.split["value"], - default_left=default_left, - cover=0.0, - ) - nodes_queue.append((right_node, right_node_id)) - 
else: - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_node.right.value[class_label], - parent_id=cur_node_id, - position=1, - cover=0.0, - ) - - else: - # Tree has only one node - mb.add_leaf( - tree_id=cur_tree_id, - response=root_node.value[class_label], - cover=0.0, - ) - - warn("Models converted from CatBoost cannot be used for SHAP value calculation") - return mb.model() From 0859330ee5d2819699eae061180dc7d1b1c96e93 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 03:26:32 -0700 Subject: [PATCH 29/64] fix: remove local backup file --- src/gbt_convertors-recursive-append.pyx | 651 ------------------------ 1 file changed, 651 deletions(-) delete mode 100644 src/gbt_convertors-recursive-append.pyx diff --git a/src/gbt_convertors-recursive-append.pyx b/src/gbt_convertors-recursive-append.pyx deleted file mode 100644 index 6ffce3277d..0000000000 --- a/src/gbt_convertors-recursive-append.pyx +++ /dev/null @@ -1,651 +0,0 @@ -# =============================================================================== -# Copyright 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =============================================================================== - -import json -from collections import deque -from os import getpid, remove -from time import time -from typing import Any, Deque, Dict, Generator, List, Optional, Tuple - -import numpy as np -import xgboost as xgb - - -class CatBoostNode: - def __init__( - self, - split: Optional[float] = None, - value: Optional[List[float]] = None, - right: Optional[int] = None, - left: Optional[float] = None, - ) -> None: - self.split = split - self.value = value - self.right = right - self.left = left - - -class LightGbmNode: - def __init__(self, tree: Dict[str, Any], parent_id: int, position: int) -> None: - self.tree = tree - self.parent_id = parent_id - self.position = position - - -class Node: - """Helper class holding Tree Node information""" - - def __init__( - self, - node_id: int, - cover: float, - is_leaf: bool, - default_left: bool, - feature: int, - value: float, - n_children: int = 0, - left_child: "Optional[Node]" = None, - right_child: "Optional[Node]" = None, - parent_id: Optional[int] = -1, - position: Optional[int] = -1, - ) -> None: - self.node_id = node_id - self.cover = cover - self.is_leaf = is_leaf - self.default_left = default_left - self.__feature = feature - self.value = value - self.n_children = n_children - self.left_child = left_child - self.right_child = right_child - self.parent_id = parent_id - self.position = position - - @staticmethod - def from_dict(input_dict: Dict[str, Any]) -> "Node": - if "children" in input_dict: - left_child = Node.from_dict(input_dict["children"][0]) - right_child = Node.from_dict(input_dict["children"][1]) - n_children = 2 + left_child.n_children + right_child.n_children - else: - left_child = None - right_child = None - n_children = 0 - is_leaf = "leaf" in input_dict - default_left = "yes" in input_dict and input_dict["yes"] == input_dict["missing"] - return 
Node( - node_id=input_dict["nodeid"], - cover=input_dict["cover"], - is_leaf=is_leaf, - default_left=default_left, - feature=input_dict.get("split"), - value=input_dict["leaf"] if is_leaf else input_dict["split_condition"], - n_children=n_children, - left_child=left_child, - right_child=right_child, - ) - - def get_value_closest_float_downward(self) -> np.float64: - """Get the closest exact fp value smaller than self.value""" - return np.nextafter(np.single(self.value), np.single(-np.inf)) - - def get_children(self) -> "Optional[Tuple[Node, Node]]": - if not self.left_child or not self.right_child: - assert self.is_leaf - else: - return (self.left_child, self.right_child) - - @property - def feature(self) -> int: - if not (isinstance(self.__feature, str) and self.__feature.isnumeric()): - raise ValueError( - f"Feature names must be integers (got ({type(self.__feature)}){self.__feature})" - ) - return int(self.__feature) - - -class TreeView: - """Helper class, treating a list of nodes as one tree""" - - def __init__(self, tree_id: int, root_node: Node) -> None: - self.tree_id = tree_id - self.root_node = root_node - - @property - def is_leaf(self) -> bool: - return self.root_node.is_leaf - - @property - def value(self) -> float: - if not self.is_leaf: - raise ValueError("Tree is not a leaf-only tree") - if not self.root_node.value: - raise ValueError("Tree is leaf-only but leaf node has no value") - return self.root_node.value - - @property - def cover(self) -> float: - if not self.is_leaf: - raise ValueError("Tree is not a leaf-only tree") - return self.root_node.cover - - @property - def n_nodes(self) -> int: - return self.root_node.n_children + 1 - - -class TreeList: - """Helper class that is able to extract all information required by the - model builders from an XGBoost.Booster object""" - - def __init__(self): - self.tree_views: List[TreeView] = [] - - @staticmethod - def from_booster(booster) -> "TreeList": - """ - Load a TreeList from an xgb.Booster object - Note: We cannot type-hint the xgb.Booster without loading xgb as dependency in pyx code, - therefore not type hint is added. 
- """ - tl = TreeList() - dump = booster.get_dump(dump_format="json", with_stats=True) - for tree_id, raw_tree in enumerate(dump): - raw_tree_parsed = json.loads(raw_tree) - root_node = Node.from_dict(raw_tree_parsed) - tl.append(TreeView(tree_id=tree_id, root_node=root_node)) - - return tl - - def append(self, elem): - self.tree_views.append(elem) - - def __iter__(self) -> Generator[TreeView, None, None]: - """Iterate over TreeViews""" - for tree_view in self.tree_views: - yield tree_view - - def __setitem__(self): - raise NotImplementedError("Use TreeList.from_booster() to initialize a TreeList") - - -def get_lightgbm_params(booster): - return booster.dump_model() - - -def get_xgboost_params(booster): - return json.loads(booster.save_config()) - - -def get_catboost_params(booster): - dump_filename = f"catboost_model_{getpid()}_{time()}" - - # Dump model in file - booster.save_model(dump_filename, "json") - - # Read json with model - with open(dump_filename) as file: - model_data = json.load(file) - - # Delete dump file - remove(dump_filename) - return model_data - - -def get_gbt_model_from_lightgbm(model: Any, lgb_model=None) -> Any: - if lgb_model is None: - lgb_model = get_lightgbm_params(model) - - n_features = lgb_model["max_feature_idx"] + 1 - n_iterations = len(lgb_model["tree_info"]) / lgb_model["num_tree_per_iteration"] - n_classes = lgb_model["num_tree_per_iteration"] - - is_regression = False - objective_fun = lgb_model["objective"] - if n_classes > 2: - if "multiclass" not in objective_fun: - raise TypeError( - "multiclass (softmax) objective is only supported for multiclass classification" - ) - elif "binary" in objective_fun: # nClasses == 1 - n_classes = 2 - else: - is_regression = True - - class_label = 0 - - tree_list = ... - - return get_gbt_model_from_tree_list(tree_list, n_iterations=n_iterations, is_regression=is_regression, n_features=n_features, n_classes=n_classes, base_score=base_score, add_base_score_as_tree=False) - - -def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: - # Release Note for XGBoost 1.5.0: Python interface now supports configuring - # constraints using feature names instead of feature indices. This also - # helps with pandas input with set feature names. 
- booster.feature_names = [str(i) for i in range(booster.num_features())] - - if xgb_config is None: - xgb_config = get_xgboost_params(booster) - - n_features = int(xgb_config["learner"]["learner_model_param"]["num_feature"]) - n_classes = int(xgb_config["learner"]["learner_model_param"]["num_class"]) - base_score = float(xgb_config["learner"]["learner_model_param"]["base_score"]) - - is_regression = False - objective_fun = xgb_config["learner"]["learner_train_param"]["objective"] - if n_classes > 2: - if objective_fun not in ["multi:softprob", "multi:softmax"]: - raise TypeError( - "multi:softprob and multi:softmax are only supported for multiclass classification" - ) - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: - raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification" - ) - else: - is_regression = True - - n_iterations = booster.best_iteration + 1 - - tree_list = TreeList.from_booster(booster) - - return get_gbt_model_from_tree_list(tree_list, n_iterations=n_iterations, is_regression=is_regression, n_features=n_features, n_classes=n_classes, base_score=base_score, add_base_score_as_tree=True) - - -def get_gbt_model_from_tree_list(tree_list: TreeList, n_iterations: int, is_regression: bool, n_features: int, n_classes: int, base_score: float, add_base_score_as_tree: bool) - """Return a GBT Model from TreeList""" - - if is_regression: - if add_base_score_as_tree: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations + 1) - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) - else: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) - else: - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - - class_label = 0 - for counter, tree in enumerate(tree_list, start=1): - # find out the number of nodes in the tree - if is_regression: - tree_id = mb.create_tree(tree.n_nodes) - else: - tree_id = mb.create_tree(n_nodes=tree.n_nodes, class_label=class_label) - - if counter % n_iterations == 0: - class_label += 1 - - if tree.is_leaf: - mb.add_leaf(tree_id=tree_id, response=tree.value, cover=tree.cover) - continue - - root_node = tree.root_node - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=root_node.feature, - feature_value=root_node.get_value_closest_float_downward(), - cover=root_node.cover, - default_left=root_node.default_left, - ) - - # create queue - node_queue: Deque[Node] = deque() - children = root_node.get_children() - assert children is not None - for position, child in enumerate(children): - child.parent_id = parent_id - child.position = position - node_queue.append(child) - - while node_queue: - node = node_queue.popleft() - assert node.parent_id != -1, "node.parent_id must not be -1" - assert node.position != -1, "node.position must not be -1" - - if node.is_leaf: - mb.add_leaf( - tree_id=tree_id, - response=node.value, - cover=node.cover, - parent_id=node.parent_id, - position=node.position, - ) - else: - parent_id = mb.add_split( - tree_id=tree_id, - feature_index=node.feature, - feature_value=node.get_value_closest_float_downward(), - cover=node.cover, - default_left=node.default_left, - parent_id=node.parent_id, - position=node.position, - ) - - children = node.get_children() - assert children is not None - for position, child in enumerate(children): - child.parent_id = parent_id - child.position = 
position - node_queue.append(child) - - return mb.model() - - -def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: - if not model.is_fitted(): - raise RuntimeError("Model should be fitted before exporting to daal4py.") - - if model_data is None: - model_data = get_catboost_params(model) - - if "categorical_features" in model_data["features_info"]: - raise NotImplementedError( - "Categorical features are not supported in daal4py Gradient Boosting Trees" - ) - - n_features = len(model_data["features_info"]["float_features"]) - - is_symmetric_tree = ( - model_data["model_info"]["params"]["tree_learner_options"]["grow_policy"] - == "SymmetricTree" - ) - - if is_symmetric_tree: - n_iterations = len(model_data["oblivious_trees"]) - else: - n_iterations = len(model_data["trees"]) - - n_classes = 0 - - if "class_params" in model_data["model_info"]: - is_classification = True - n_classes = len(model_data["model_info"]["class_params"]["class_to_label"]) - mb = gbt_clf_model_builder( - n_features=n_features, n_iterations=n_iterations, n_classes=n_classes - ) - else: - is_classification = False - mb = gbt_reg_model_builder(n_features, n_iterations) - - splits = [] - - # Create splits array (all splits are placed sequentially) - for feature in model_data["features_info"]["float_features"]: - if feature["borders"]: - for feature_border in feature["borders"]: - splits.append( - {"feature_index": feature["feature_index"], "value": feature_border} - ) - - if not is_classification: - bias = model_data["scale_and_bias"][1][0] / n_iterations - scale = model_data["scale_and_bias"][0] - else: - bias = 0 - scale = 1 - - trees_explicit = [] - tree_symmetric = [] - - if ( - model_data["model_info"]["params"]["data_processing_options"][ - "float_features_binarization" - ]["nan_mode"] - == "Min" - ): - default_left = 1 - else: - default_left = 0 - - for tree_num in range(n_iterations): - if is_symmetric_tree: - if model_data["oblivious_trees"][tree_num]["splits"] is not None: - # Tree has more than 1 node - cur_tree_depth = len(model_data["oblivious_trees"][tree_num]["splits"]) - else: - cur_tree_depth = 0 - - tree_symmetric.append( - (model_data["oblivious_trees"][tree_num], cur_tree_depth) - ) - else: - n_nodes = 1 - # Check if node is a leaf (in case of stump) - if "split" in model_data["trees"][tree_num]: - # Get number of trees and splits info via BFS - # Create queue - nodes_queue = [] - root_node = CatBoostNode( - split=splits[model_data["trees"][tree_num]["split"]["split_index"]] - ) - nodes_queue.append((model_data["trees"][tree_num], root_node)) - while nodes_queue: - cur_node_data, cur_node = nodes_queue.pop(0) - if "value" in cur_node_data: - if isinstance(cur_node_data["value"], list): - cur_node.value = [value for value in cur_node_data["value"]] - else: - cur_node.value = [cur_node_data["value"] * scale + bias] - else: - cur_node.split = splits[cur_node_data["split"]["split_index"]] - left_node = CatBoostNode() - right_node = CatBoostNode() - cur_node.left = left_node - cur_node.right = right_node - nodes_queue.append((cur_node_data["left"], left_node)) - nodes_queue.append((cur_node_data["right"], right_node)) - n_nodes += 2 - else: - root_node = CatBoostNode() - if is_classification and n_classes > 2: - root_node.value = [ - value * scale for value in model_data["trees"][tree_num]["value"] - ] - else: - root_node.value = [ - model_data["trees"][tree_num]["value"] * scale + bias - ] - trees_explicit.append((root_node, n_nodes)) - - tree_id = [] - class_label = 0 - count = 0 - - # 
Only 1 tree for each iteration in case of regression or binary classification - if not is_classification or n_classes == 2: - n_tree_each_iter = 1 - else: - n_tree_each_iter = n_classes - - # Create id for trees (for the right order in modelbuilder) - for i in range(n_iterations): - for c in range(n_tree_each_iter): - if is_symmetric_tree: - n_nodes = 2 ** (tree_symmetric[i][1] + 1) - 1 - else: - n_nodes = trees_explicit[i][1] - - if is_classification and n_classes > 2: - tree_id.append(mb.create_tree(n_nodes, class_label)) - count += 1 - if count == n_iterations: - class_label += 1 - count = 0 - - elif is_classification: - tree_id.append(mb.create_tree(n_nodes, 0)) - else: - tree_id.append(mb.create_tree(n_nodes)) - - if is_symmetric_tree: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - cur_tree_info = tree_symmetric[i][0] - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - cur_tree_leaf_val = cur_tree_info["leaf_values"] - cur_tree_depth = tree_symmetric[i][1] - - if cur_tree_depth == 0: - mb.add_leaf(tree_id=cur_tree_id, response=cur_tree_leaf_val[0]) - else: - # One split used for the whole level - cur_level_split = splits[ - cur_tree_info["splits"][cur_tree_depth - 1]["split_index"] - ] - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - prev_level_nodes = [root_id] - - # Iterate over levels, splits in json are reversed (root split is the last) - for cur_level in range(cur_tree_depth - 2, -1, -1): - cur_level_nodes = [] - for cur_parent in prev_level_nodes: - cur_level_split = splits[ - cur_tree_info["splits"][cur_level]["split_index"] - ] - cur_left_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=0, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - cur_right_node = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_parent, - position=1, - feature_index=cur_level_split["feature_index"], - feature_value=cur_level_split["value"], - default_left=default_left, - ) - cur_level_nodes.append(cur_left_node) - cur_level_nodes.append(cur_right_node) - prev_level_nodes = cur_level_nodes - - # Different storing format for leaves - if not is_classification or n_classes == 2: - for last_level_node_num in range(len(prev_level_nodes)): - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * last_level_node_num] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[2 * last_level_node_num + 1] - * scale - + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - ) - else: - for last_level_node_num in range(len(prev_level_nodes)): - left_index = ( - 2 * last_level_node_num * n_tree_each_iter + class_label - ) - right_index = ( - 2 * last_level_node_num + 1 - ) * n_tree_each_iter + class_label - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[left_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=0, - ) - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_tree_leaf_val[right_index] * scale + bias, - parent_id=prev_level_nodes[last_level_node_num], - position=1, - ) - else: - for class_label in range(n_tree_each_iter): - for i in range(n_iterations): - root_node = trees_explicit[i][0] - - cur_tree_id = tree_id[i * n_tree_each_iter + class_label] - # 
Traverse tree via BFS and build tree with modelbuilder - if root_node.value is None: - root_id = mb.add_split( - tree_id=cur_tree_id, - feature_index=root_node.split["feature_index"], - feature_value=root_node.split["value"], - default_left=default_left, - ) - nodes_queue = [(root_node, root_id)] - while nodes_queue: - cur_node, cur_node_id = nodes_queue.pop(0) - left_node = cur_node.left - # Check if node is a leaf - if left_node.value is None: - left_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=0, - feature_index=left_node.split["feature_index"], - feature_value=left_node.split["value"], - default_left=default_left, - ) - nodes_queue.append((left_node, left_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, - response=left_node.value[class_label], - parent_id=cur_node_id, - position=0, - ) - right_node = cur_node.right - # Check if node is a leaf - if right_node.value is None: - right_node_id = mb.add_split( - tree_id=cur_tree_id, - parent_id=cur_node_id, - position=1, - feature_index=right_node.split["feature_index"], - feature_value=right_node.split["value"], - default_left=default_left, - ) - nodes_queue.append((right_node, right_node_id)) - else: - mb.add_leaf( - tree_id=cur_tree_id, - response=cur_node.right.value[class_label], - parent_id=cur_node_id, - position=1, - ) - - else: - # Tree has only one node - mb.add_leaf( - tree_id=cur_tree_id, response=root_node.value[class_label] - ) - - return mb.model() From bdac1f5f15adfc583428ce991452bb905acf91da Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 10 Oct 2023 08:10:01 -0700 Subject: [PATCH 30/64] Start work on fixing LightGBM model builder test cases --- src/gbt_convertors.pyx | 3 +- tests/test_model_builders.py | 88 ++++++++++++++++++------------------ 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index f0c5edf2e7..ac87e27440 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -112,7 +112,7 @@ class Node: return Node( cover=tree["leaf_count"] if is_leaf else tree["internal_count"], is_leaf=is_leaf, - default_left=is_leaf or tree["default_left"], + default_left=tree.get("default_left", 0), feature=tree.get("split_feature"), value=tree["leaf_value"] if is_leaf else tree["threshold"], n_children=n_children, @@ -301,6 +301,7 @@ def get_gbt_model_from_tree_list( position=node.position, ) else: + print(f"add split, {node.default_left=}") parent_id = mb.add_split( tree_id=tree_id, feature_index=node.feature, diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index ecb464d58f..28dde5ab75 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -248,7 +248,7 @@ def setUpClass(cls): "task": "train", "boosting": "gbdt", "objective": "regression", - "num_leaves": 10, + "num_leaves": 4, "learning_rage": 0.05, "metric": {"l2", "l1"}, "verbose": -1, @@ -267,50 +267,48 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test) lgbm_pred = self.lgbm_model.predict(self.X_test) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) def test_missing_value_support(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan) lgbm_pred = self.lgbm_model.predict(self.X_nan) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) - - def 
test_model_predict_shap_contribs(self): - m = d4p.mb.convert_model(self.lgbm_model) - d4p_pred = m.predict(self.X_test, pred_contribs=True) - lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True) - self.assertTrue( - d4p_pred.shape == lgbm_pred.shape, - f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", - ) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) - - def test_model_predict_shap_interactions(self): - m = d4p.mb.convert_model(self.lgbm_model) - # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column - d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1] - explainer = shap.TreeExplainer(self.lgbm_model) - shap_pred = explainer.shap_interaction_values(self.X_test) - self.assertTrue( - d4p_pred.shape == shap_pred.shape, - f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}", - ) - max_diff = np.absolute(d4p_pred - shap_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) - - def test_model_predict_shap_contribs_missing_values(self): - m = d4p.mb.convert_model(self.lgbm_model) - d4p_pred = m.predict(self.X_nan, pred_contribs=True) - lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True) - self.assertTrue( - d4p_pred.shape == lgbm_pred.shape, - f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", - ) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + # def test_model_predict_shap_contribs(self): + # m = d4p.mb.convert_model(self.lgbm_model) + # d4p_pred = m.predict(self.X_test, pred_contribs=True) + # lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True) + # self.assertTrue( + # d4p_pred.shape == lgbm_pred.shape, + # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + # ) + # max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() + # self.assertLess(max_diff, 1e-7) + + # def test_model_predict_shap_interactions(self): + # m = d4p.mb.convert_model(self.lgbm_model) + # # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column + # d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1] + # explainer = shap.TreeExplainer(self.lgbm_model) + # shap_pred = explainer.shap_interaction_values(self.X_test) + # self.assertTrue( + # d4p_pred.shape == shap_pred.shape, + # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}", + # ) + # max_diff = np.absolute(d4p_pred - shap_pred).reshape(1, -1).max() + # self.assertLess(max_diff, 1e-7) + + # def test_model_predict_shap_contribs_missing_values(self): + # m = d4p.mb.convert_model(self.lgbm_model) + # d4p_pred = m.predict(self.X_nan, pred_contribs=True) + # lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True) + # self.assertTrue( + # d4p_pred.shape == lgbm_pred.shape, + # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + # ) + # max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() + # self.assertLess(max_diff, 1e-7) class LightGBMClassificationModelBuilder(unittest.TestCase): @@ -336,19 +334,21 @@ def test_model_conversion(self): m = d4p.mb.convert_model(self.lgbm_model) self.assertEqual(m.n_classes_, 3) 
self.assertEqual(m.n_features_in_, 10) - self.assertTrue(m._is_regression) + self.assertFalse(m._is_regression) def test_model_predict(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test) lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1) - self.assertTrue((d4p_pred == lgbm_pred).all()) + max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() + self.assertLess(max_diff, 1e-7) def test_missing_value_support(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan) - lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1) - self.assertTrue((d4p_pred == lgbm_pred).all()) + lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1) + max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() + self.assertLess(max_diff, 1e-7) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) From cb0d895b6547d3f143f11c192f50029806671bc0 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 12 Oct 2023 09:46:14 -0700 Subject: [PATCH 31/64] Properly use XGBoost's base_score parameter --- src/gbt_convertors.pyx | 45 +++---- src/gbt_model_builder.h | 16 +++ src/gbt_model_builder.pyx | 14 +- tests/test_model_builders.py | 251 ++++++++++++++++++++++------------- 4 files changed, 204 insertions(+), 122 deletions(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index ac87e27440..6fa6c50ba9 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -235,20 +235,12 @@ def get_gbt_model_from_tree_list( is_regression: bool, n_features: int, n_classes: int, - base_score: float, - add_base_score_as_tree: bool, + base_score: Optional[float] = None, ): """Return a GBT Model from TreeList""" if is_regression: - if add_base_score_as_tree: - mb = gbt_reg_model_builder( - n_features=n_features, n_iterations=n_iterations + 1 - ) - tree_id = mb.create_tree(1) - mb.add_leaf(tree_id=tree_id, response=base_score, cover=1) - else: - mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) + mb = gbt_reg_model_builder(n_features=n_features, n_iterations=n_iterations) else: mb = gbt_clf_model_builder( n_features=n_features, n_iterations=n_iterations, n_classes=n_classes @@ -301,7 +293,6 @@ def get_gbt_model_from_tree_list( position=node.position, ) else: - print(f"add split, {node.default_left=}") parent_id = mb.add_split( tree_id=tree_id, feature_index=node.feature, @@ -319,7 +310,7 @@ def get_gbt_model_from_tree_list( child.position = position node_queue.append(child) - return mb.model() + return mb.model(base_score=base_score) def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: @@ -350,8 +341,6 @@ def get_gbt_model_from_lightgbm(model: Any, booster=None) -> Any: is_regression=is_regression, n_features=n_features, n_classes=n_classes, - base_score=0, - add_base_score_as_tree=False, ) @@ -375,19 +364,28 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: raise TypeError( "multi:softprob and multi:softmax are only supported for multiclass classification" ) - elif objective_fun.find("binary:") == 0: - if objective_fun in ["binary:logistic", "binary:logitraw"]: - n_classes = 2 - else: + elif objective_fun.startswith("binary:"): + if objective_fun not in ["binary:logistic", "binary:logitraw"]: raise TypeError( - "binary:logistic and binary:logitraw are only supported for binary classification" + "only binary:logistic and binary:logitraw are supported for binary classification" + ) + n_classes = 2 + if objective_fun == 
"binary:logitraw": + # daal4py always applies a sigmoid for pred_proba, wheres XGBoost + # returns raw predictions with logitraw + warn( + "objective='binary:logitraw' selected\n" + "XGBoost returns raw class scores when calling pred_proba()\n" + "whilst scikit-learn-intelex always uses binary:logistic\n" ) + if base_score != 0.5: + warn("objective='binary:logitraw' ignores base_score, fixing base_score to 0.5") + base_score = 0.5 else: is_regression = True - max_trees = ( - getattr(booster, "best_iteration", -1) + 1 - ) # 0 if best_iteration does not exist + # max_trees=0 if best_iteration does not exist + max_trees = getattr(booster, "best_iteration", -1) + 1 if n_classes > 2: max_trees *= n_classes tree_list = TreeList.from_xgb_booster(booster, max_trees) @@ -404,7 +402,6 @@ def get_gbt_model_from_xgboost(booster: Any, xgb_config=None) -> Any: n_features=n_features, n_classes=n_classes, base_score=base_score, - add_base_score_as_tree=True, ) @@ -720,4 +717,4 @@ def get_gbt_model_from_catboost(model: Any, model_data=None) -> Any: ) warn("Models converted from CatBoost cannot be used for SHAP value calculation") - return mb.model() + return mb.model(0.0) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 7ebcc32838..8fb30ecd65 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -25,6 +25,7 @@ #if (((MAJOR_VERSION == 2024) && (MINOR_VERSION == 0) && (UPDATE_VERSION >= 1)) || ((MAJOR_VERSION > 2024) && (MINOR_VERSION >= 1))) // added missing value support to GBT regression // added SHAP value support + // added base_score parameter #define _gbt_inference_api_version 2 #elif (((MAJOR_VERSION == 2023) && (MINOR_VERSION >= 2)) || (MAJOR_VERSION > 2023)) // added missing value support to GBT classification @@ -44,6 +45,20 @@ typedef c_gbt_regression_model_builder::TreeId c_gbt_reg_tree_id; #define c_gbt_clf_no_parent c_gbt_classification_model_builder::noParent #define c_gbt_reg_no_parent c_gbt_regression_model_builder::noParent +#if (_gbt_inference_api_version == 2) +static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_, double base_score) +{ + daal::algorithms::gbt::classification::ModelPtr * ptr = RAW()(obj_->getModel()); + ptr->get()->setPredictionBias(base_score); + return ptr; +} +static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_, double base_score) +{ + daal::algorithms::gbt::regression::ModelPtr * ptr = RAW()(obj_->getModel()); + ptr->get()->setPredictionBias(base_score); + return ptr; +} +#else static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_) { return RAW()(obj_->getModel()); @@ -53,6 +68,7 @@ static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_bu { return RAW()(obj_->getModel()); } +#endif c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index e51b43fd32..1639d00552 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -27,8 +27,8 @@ cdef extern from "gbt_model_builder.h": cdef size_t c_gbt_clf_no_parent cdef size_t c_gbt_reg_no_parent - cdef 
gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *) - cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *) + cdef gbt_classification_ModelPtr * get_gbt_classification_model_builder_model(c_gbt_classification_model_builder *, double base_score) + cdef gbt_regression_ModelPtr * get_gbt_regression_model_builder_model(c_gbt_regression_model_builder *, double base_score) cdef cppclass c_gbt_classification_model_builder: c_gbt_classification_model_builder(size_t nFeatures, size_t nIterations, size_t nClasses) except + @@ -93,14 +93,15 @@ cdef class gbt_classification_model_builder: ''' return clfAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover) - def model(self): + def model(self, base_score): ''' Get built model + :param double base_score: global prediction bias (used e.g. in XGBoost) :rtype: gbt_classification_model ''' cdef gbt_classification_model res = gbt_classification_model.__new__(gbt_classification_model) - res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr) + res.c_ptr = get_gbt_classification_model_builder_model(self.c_ptr, base_score or 0.0) return res @@ -153,14 +154,15 @@ cdef class gbt_regression_model_builder: ''' return regAddSplitNodeWrapper(self.c_ptr, tree_id, parent_id, position, feature_index, feature_value, default_left, cover) - def model(self): + def model(self, base_score): ''' Get built model + :param double base_score: global prediction bias (used e.g. in XGBoost) :rtype: gbt_regression_model ''' cdef gbt_regression_model res = gbt_regression_model.__new__(gbt_regression_model) - res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr) + res.c_ptr = get_gbt_regression_model_builder_model(self.c_ptr, base_score or 0.0) return res diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 28dde5ab75..3cf857d17a 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -105,11 +105,13 @@ def test_breast_cancer_without_intercept(self): class XGBoostRegressionModelBuilder(unittest.TestCase): @classmethod - def setUpClass(cls): + def setUpClass(cls, base_score=0.5): X, y = make_regression(n_samples=2, n_features=10, random_state=42) cls.X_test = X[:2, :] cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) - cls.xgb_model = xgb.XGBRegressor(max_depth=5, n_estimators=50, random_state=42) + cls.xgb_model = xgb.XGBRegressor( + max_depth=5, n_estimators=50, random_state=42, base_score=base_score + ) cls.xgb_model.fit(X, y) def test_model_conversion(self): @@ -124,19 +126,13 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.xgb_model.get_booster()) d4p_pred = m.predict(self.X_test) xgboost_pred = self.xgb_model.predict(self.X_test) - self.assertTrue( - np.allclose(d4p_pred, xgboost_pred, atol=1e-7), - f"d4p and reference prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}", - ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) def test_missing_value_support(self): m = d4p.mb.convert_model(self.xgb_model.get_booster()) d4p_pred = m.predict(self.X_nan) xgboost_pred = self.xgb_model.predict(self.X_nan) - self.assertTrue( - np.allclose(d4p_pred, xgboost_pred, atol=1e-7), - f"d4p and reference missing value prediction different (d4p - ref) = {d4p_pred - xgboost_pred}", - ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) def test_model_predict_shap_contribs(self): booster = 
self.xgb_model.get_booster()
@@ -152,10 +148,7 @@ def test_model_predict_shap_contribs(self):
             d4p_pred.shape == xgboost_pred.shape,
             f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {xgboost_pred.shape}",
         )
-        self.assertTrue(
-            np.allclose(d4p_pred, xgboost_pred, atol=1e-7),
-            f"d4p and reference SHAP contribution prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}",
-        )
+        np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
 
     def test_model_predict_shap_interactions(self):
         booster = self.xgb_model.get_booster()
@@ -171,10 +164,7 @@ def test_model_predict_shap_interactions(self):
             d4p_pred.shape == xgboost_pred.shape,
             f"d4p and reference SHAP interaction shape is different {d4p_pred.shape} != {xgboost_pred.shape}",
         )
-        self.assertTrue(
-            np.allclose(d4p_pred, xgboost_pred, atol=1e-7),
-            f"d4p and reference SHAP interaction prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}",
-        )
+        np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)
 
     def test_model_predict_shap_contribs_missing_values(self):
         booster = self.xgb_model.get_booster()
@@ -186,44 +176,75 @@ def test_model_predict_shap_contribs_missing_values(self):
             approx_contribs=False,
             validate_features=False,
         )
-        self.assertTrue(
-            np.allclose(d4p_pred, xgboost_pred, atol=1e-7),
-            f"d4p and reference SHAP contribution missing value prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}",
-        )
+        np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=5e-6)
 
 
-class XGBoostClassificationModelBuilder(unittest.TestCase):
+# duplicate all tests for base_score=0.0
+class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
-        X, y = make_classification(n_samples=500, n_features=10, random_state=42)
+        XGBoostRegressionModelBuilder.setUpClass(0)
+
+
+# duplicate all tests for base_score=100
+class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder):
+    @classmethod
+    def setUpClass(cls):
+        XGBoostRegressionModelBuilder.setUpClass(100)
+
+
+class XGBoostClassificationModelBuilder(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"):
+        n_features = 15
+        cls.base_score = base_score
+        cls.n_classes = n_classes
+        X, y = make_classification(
+            n_samples=500,
+            n_classes=n_classes,
+            n_features=n_features,
+            n_informative=10,
+            random_state=42,
+        )
         cls.X_test = X[:2, :]
-        cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10)
-        cls.xgb_model = xgb.XGBClassifier(max_depth=5, n_estimators=50, random_state=42)
+        cls.X_nan = np.array([np.nan] * 2 * n_features, dtype=np.float32).reshape(
+            2, n_features
+        )
+        cls.xgb_model = xgb.XGBClassifier(
+            max_depth=5,
+            n_estimators=50,
+            random_state=42,
+            base_score=base_score,
+            objective=objective,
+        )
         cls.xgb_model.fit(X, y)
 
     def test_model_conversion(self):
         m = d4p.mb.convert_model(self.xgb_model.get_booster())
-        self.assertEqual(m.n_classes_, 2)
-        self.assertEqual(m.n_features_in_, 10)
+        self.assertEqual(m.n_classes_, self.n_classes)
+        self.assertEqual(m.n_features_in_, 15)
         self.assertFalse(m._is_regression)
 
     def test_model_predict(self):
         m = d4p.mb.convert_model(self.xgb_model.get_booster())
         d4p_pred = m.predict(self.X_test)
         xgboost_pred = self.xgb_model.predict(self.X_test)
-        self.assertTrue(
-            np.allclose(d4p_pred, xgboost_pred, atol=1e-7),
-            f"d4p and reference prediction are different (d4p - ref) = {d4p_pred - xgboost_pred}",
-        )
+        np.testing.assert_allclose(d4p_pred, xgboost_pred, 
rtol=1e-7) + + def test_model_predict_proba(self): + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict_proba(self.X_test) + xgboost_pred = self.xgb_model.predict_proba(self.X_test) + # calculating probas involves multiple exp / ln operations, therefore + # they're quite susceptible to small numerical changes and we have to + # accept an rtol of 1e-5 + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5) def test_missing_value_support(self): m = d4p.mb.convert_model(self.xgb_model.get_booster()) d4p_pred = m.predict(self.X_nan) xgboost_pred = self.xgb_model.predict(self.X_nan) - self.assertTrue( - np.allclose(d4p_pred, xgboost_pred, atol=1e-7), - f"d4p and reference missing value prediction different (d4p - ref) = {d4p_pred - xgboost_pred}", - ) + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7) def test_model_predict_shap_contribs(self): booster = self.xgb_model.get_booster() @@ -238,12 +259,65 @@ def test_model_predict_shap_interactions(self): m.predict(self.X_test, pred_contribs=True) +# duplicate all tests for bae_score=0.01 +class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(base_score=0.3) + + +# duplicate all tests for bae_score=0.99 +class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(base_score=0.7) + + +class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(n_classes=5) + + +class XGBoostClassificationModelBuilder_objective_logitraw( + XGBoostClassificationModelBuilder +): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass( + base_score=0.5, n_classes=2, objective="binary:logitraw" + ) + + def test_model_predict_proba(self): + # overload this function because daal4py always applies the sigmoid + # for bias 0.5, we can still check if the original scores are correct + with self.assertWarns(UserWarning): + # expect a warning that logitraw behaves differently and/or + # that base_score is ignored / fixed to 0.5 + m = d4p.mb.convert_model(self.xgb_model.get_booster()) + d4p_pred = m.predict_proba(self.X_test) + # undo sigmoid + d4p_pred = np.log(-d4p_pred / (d4p_pred - 1)) + # undo bias + d4p_pred += 0.5 + xgboost_pred = self.xgb_model.predict_proba(self.X_test) + # calculating probas involves multiple exp / ln operations, therefore + # they're quite susceptible to small numerical changes and we have to + # accept an rtol of 1e-5 + np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5) + + class LightGBMRegressionModelBuilder(unittest.TestCase): @classmethod def setUpClass(cls): X, y = make_regression(n_samples=100, n_features=10, random_state=42) cls.X_test = X[:2, :] cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + # LightGBM requires a couple of NaN values in the training data to properly set + # the missing value type to NaN + # https://github.com/microsoft/LightGBM/issues/6139 + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) params = { "task": "train", "boosting": "gbdt", @@ -252,13 +326,12 @@ def setUpClass(cls): "learning_rage": 0.05, "metric": {"l2", "l1"}, "verbose": -1, + "n_estimators": 1, } - cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X, y)) + cls.lgbm_model 
= lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) def test_model_conversion(self): m = d4p.mb.convert_model(self.lgbm_model) - # XGBoost treats regression as 0 classes, LightGBM 1 class - # For us, it does not make a difference and both are acceptable self.assertEqual(m.n_classes_, 1) self.assertEqual(m.n_features_in_, 10) self.assertTrue(m._is_regression) @@ -267,48 +340,50 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test) lgbm_pred = self.lgbm_model.predict(self.X_test) - np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) def test_missing_value_support(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan) lgbm_pred = self.lgbm_model.predict(self.X_nan) - np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6) + + def test_model_predict_shap_contribs(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test, pred_contribs=True) + explainer = shap.TreeExplainer(self.lgbm_model) + shap_pred = explainer(self.X_test).values + lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True) + self.assertTrue( + d4p_pred.shape == lgbm_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) + + def test_model_predict_shap_interactions(self): + m = d4p.mb.convert_model(self.lgbm_model) + # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column + d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1] + explainer = shap.TreeExplainer(self.lgbm_model) + shap_pred = explainer.shap_interaction_values(self.X_test) + self.assertTrue( + d4p_pred.shape == shap_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6) - # def test_model_predict_shap_contribs(self): - # m = d4p.mb.convert_model(self.lgbm_model) - # d4p_pred = m.predict(self.X_test, pred_contribs=True) - # lgbm_pred = self.lgbm_model.predict(self.X_test, pred_contrib=True) - # self.assertTrue( - # d4p_pred.shape == lgbm_pred.shape, - # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", - # ) - # max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - # self.assertLess(max_diff, 1e-7) - - # def test_model_predict_shap_interactions(self): - # m = d4p.mb.convert_model(self.lgbm_model) - # # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column - # d4p_pred = m.predict(self.X_test, pred_interactions=True)[:, :-1, :-1] - # explainer = shap.TreeExplainer(self.lgbm_model) - # shap_pred = explainer.shap_interaction_values(self.X_test) - # self.assertTrue( - # d4p_pred.shape == shap_pred.shape, - # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {shap_pred.shape}", - # ) - # max_diff = np.absolute(d4p_pred - shap_pred).reshape(1, -1).max() - # self.assertLess(max_diff, 1e-7) - - # def test_model_predict_shap_contribs_missing_values(self): - # m = d4p.mb.convert_model(self.lgbm_model) - # d4p_pred = m.predict(self.X_nan, pred_contribs=True) - # lgbm_pred = self.lgbm_model.predict(self.X_nan, 
pred_contrib=True) - # self.assertTrue( - # d4p_pred.shape == lgbm_pred.shape, - # f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", - # ) - # max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - # self.assertLess(max_diff, 1e-7) + # Will revisit further LightGBM validation after resolving microsoft/LightGBM#6139 + # @unittest.skipIf(lgbm.__version__ == "4.1.0", "LightGBM models from dump_model() are possibly broken in v4.1.0") + def test_model_predict_shap_contribs_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan, pred_contribs=True) + lgbm_pred = self.lgbm_model.predict(self.X_nan, pred_contrib=True) + self.assertTrue( + d4p_pred.shape == lgbm_pred.shape, + f"d4p and reference SHAP contribution shape is different {d4p_pred.shape} != {lgbm_pred.shape}", + ) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) class LightGBMClassificationModelBuilder(unittest.TestCase): @@ -340,15 +415,13 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test) lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) def test_missing_value_support(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan) lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) @@ -397,15 +470,13 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.cb_model) d4p_pred = m.predict(self.X_test) lgbm_pred = self.cb_model.predict(self.X_test) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) def test_missing_value_support(self): m = d4p.mb.convert_model(self.cb_model) d4p_pred = m.predict(self.X_nan) lgbm_pred = self.cb_model.predict(self.X_nan) - max_diff = np.absolute(d4p_pred - lgbm_pred).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) def test_model_predict_shap_contribs(self): # SHAP value support from CatBoost models is to be added @@ -449,13 +520,13 @@ def test_model_predict(self): m = d4p.mb.convert_model(self.cb_model) d4p_pred = m.predict(self.X_test) cb_pred = self.cb_model.predict(self.X_test, prediction_type="Class").T[0] - self.assertTrue((d4p_pred == cb_pred).all()) + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) def test_missing_value_support(self): m = d4p.mb.convert_model(self.cb_model) d4p_pred = m.predict(self.X_nan) cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0] - self.assertTrue((d4p_pred == cb_pred).all()) + np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) def test_model_predict_shap_contribs(self): # SHAP value support from CatBoost models is to be added @@ -514,8 +585,8 @@ def test_early_stopping(self): daal_errors_count = np.count_nonzero(daal_prediction - np.ravel(self.y_test)) self.assertTrue(np.absolute(xgb_errors_count - daal_errors_count) == 0) - max_diff = np.absolute(xgb_proba - daal_proba).reshape(1, -1).max() - self.assertLess(max_diff, 1e-7) + 
+
+        np.testing.assert_allclose(xgb_proba, daal_proba, rtol=1e-6)
 
 
 class ModelBuilderTreeView(unittest.TestCase):
@@ -529,7 +600,7 @@ def get_dump(self, *_, **kwargs):
         ]
 
         mock = MockBooster()
-        result = d4p.TreeList.from_xgb_booster(mock)
+        result = d4p.TreeList.from_xgb_booster(mock, max_trees=0)
         self.assertEqual(len(result), 2)
 
         tree0 = result[0]
@@ -542,10 +613,6 @@ def get_dump(self, *_, **kwargs):
 
         self.assertIsInstance(tree0.root_node, d4p.Node)
 
-        self.assertEqual(tree0.root_node.node_id, 0)
-        self.assertEqual(tree0.root_node.left_child.node_id, 1)
-        self.assertEqual(tree0.root_node.right_child.node_id, 2)
-
         self.assertEqual(tree0.root_node.cover, 4)
         self.assertEqual(tree0.root_node.left_child.cover, 6)
         self.assertEqual(tree0.root_node.right_child.cover, 8)

From 4ef7712cf3544b21b069dded5574af099fc0a32f Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 19 Oct 2023 03:32:47 -0700
Subject: [PATCH 32/64] fix: parse enums declared with bit shifting

---
 generator/parse.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/generator/parse.py b/generator/parse.py
index a0fda84af3..6611c59c94 100755
--- a/generator/parse.py
+++ b/generator/parse.py
@@ -283,8 +283,14 @@ def parse(self, elem, ctxt):
             ctxt.enum = False
             return True
         regex = (
-            r"^\s*(\w+)(?:\s*=\s*((\(int\))?\w(\w|:|\s|\+)*))?"
-            + r"(\s*,)?\s*((/\*|//).*)?$"
+            # capture group for value name
+            r"^\s*(\w+)"
+            # capture group for value (different possible formats, 123, 0x1, (1 << 5), etc.)
+            + r"(?:\s*=\s*((\(int\))?(\w|:|\s|\+|\(?\d+\s*<<\s*\d+\)?)*))?"
+            # comma after the value, plus possible comments
+            + r"(\s*,)?\s*((/\*|//).*)?"
+            # EOL
+            + r"$"
         )
         me = re.match(regex, elem)
         if me and not me.group(1).startswith("last"):

From bba8beb57aec1ad8b3b8b6a7a04ea9f4dc9fdf5a Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 19 Oct 2023 04:03:06 -0700
Subject: [PATCH 33/64] refactor: SHAP prediction replace boolean parameters
 with DAAL_UINT64 flag

---
 daal4py/mb/model_builders.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py
index a10b8f0d41..1dbc146275 100644
--- a/daal4py/mb/model_builders.py
+++ b/daal4py/mb/model_builders.py
@@ -215,10 +215,14 @@ def _predict_regression(
             )
 
         # Prediction
+        resultsToCompute = ""
+        if pred_contribs:
+            resultsToCompute = "shapContributions"
+        elif pred_interactions:
+            resultsToCompute = "shapInteractions"
+
         predict_algo = d4p.gbt_regression_prediction(
-            fptype=fptype,
-            predShapContributions=pred_contribs,
-            predShapInteractions=pred_interactions,
+            fptype=fptype, resultsToCompute=resultsToCompute
         )
         predict_result = predict_algo.compute(X, self.daal_model_)

From 8298633b8393576a360fba7a54e83ee5b528f1a7 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 19 Oct 2023 04:03:48 -0700
Subject: [PATCH 34/64] chore: fix typos and add another classification test

---
 tests/test_model_builders.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
index 3cf857d17a..13339ab083 100644
--- a/tests/test_model_builders.py
+++ b/tests/test_model_builders.py
@@ -259,14 +259,14 @@ def test_model_predict_shap_interactions(self):
         m.predict(self.X_test, pred_contribs=True)
 
 
-# duplicate all tests for bae_score=0.01
+# duplicate all tests for base_score=0.3
 class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
XGBoostClassificationModelBuilder.setUpClass(base_score=0.3) -# duplicate all tests for bae_score=0.99 +# duplicate all tests for bae_score=0.7 class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder): @classmethod def setUpClass(cls): @@ -279,6 +279,14 @@ def setUpClass(cls): XGBoostClassificationModelBuilder.setUpClass(n_classes=5) +class XGBoostClassificationModelBuilder_n_classes5_base_score03( + XGBoostClassificationModelBuilder +): + @classmethod + def setUpClass(cls): + XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3) + + class XGBoostClassificationModelBuilder_objective_logitraw( XGBoostClassificationModelBuilder ): From 576552393b80d647052b31264cf9cd7afa397cd3 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 19 Oct 2023 06:47:23 -0700 Subject: [PATCH 35/64] feat: add more tests for LightGBM models --- tests/test_model_builders.py | 79 +++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 13339ab083..4c833f0e07 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -402,6 +402,8 @@ def setUpClass(cls): ) cls.X_test = X[:2, :] cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) params = { "n_estimators": 10, "task": "train", @@ -411,7 +413,7 @@ def setUpClass(cls): "num_class": 3, "verbose": -1, } - cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X, y)) + cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) def test_model_conversion(self): m = d4p.mb.convert_model(self.lgbm_model) @@ -425,6 +427,12 @@ def test_model_predict(self): lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_test), axis=1) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + def test_model_predict_proba(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict_proba(self.X_test) + lgbm_pred = self.lgbm_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + def test_missing_value_support(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan) @@ -447,6 +455,75 @@ def test_model_predict_shap_contribs_missing_values(self): m.predict(self.X_nan, pred_contribs=True) +class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase): + @classmethod + def setUpClass(cls): + X, y = make_classification( + random_state=3, n_classes=2, n_informative=3, n_features=10 + ) + cls.X_test = X[:2, :] + cls.X_nan = np.array([np.nan] * 20, dtype=np.float32).reshape(2, 10) + X_train = np.concatenate([cls.X_nan, X]) + y_train = np.concatenate([[0, 0], y]) + params = { + "n_estimators": 10, + "task": "train", + "boosting": "gbdt", + "objective": "binary", + "metric": "binary_logloss", + "num_leaves": 4, + "verbose": -1, + } + cls.lgbm_model = lgbm.train(params, train_set=lgbm.Dataset(X_train, y_train)) + + def test_model_conversion(self): + m = d4p.mb.convert_model(self.lgbm_model) + self.assertEqual(m.n_classes_, 2) + self.assertEqual(m.n_features_in_, 10) + self.assertFalse(m._is_regression) + + def test_model_predict(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_test) + lgbm_pred = np.round(self.lgbm_model.predict(self.X_test)).astype(int) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_proba(self): + m = 
d4p.mb.convert_model(self.lgbm_model) + # predict proba of being class 1 + d4p_pred = m.predict_proba(self.X_test)[:, 1] + lgbm_pred = self.lgbm_model.predict(self.X_test) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_missing_value_support(self): + m = d4p.mb.convert_model(self.lgbm_model) + d4p_pred = m.predict(self.X_nan) + lgbm_pred = np.round(self.lgbm_model.predict(self.X_nan)).astype(int) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_proba_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + # predict proba of being class 1 + d4p_pred = m.predict_proba(self.X_nan)[:, 1] + lgbm_pred = self.lgbm_model.predict(self.X_nan) + np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + + def test_model_predict_shap_contribs(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_contribs=True) + + def test_model_predict_shap_interactions(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_test, pred_interactions=True) + + def test_model_predict_shap_contribs_missing_values(self): + m = d4p.mb.convert_model(self.lgbm_model) + with self.assertRaises(NotImplementedError): + m.predict(self.X_nan, pred_contribs=True) + + class CatBoostRegressionModelBuilder(unittest.TestCase): @classmethod def setUpClass(cls): From 30fbafad7b9dca5a89f09bf449ef5d84bfeb3bf3 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 23 Oct 2023 00:06:46 -0700 Subject: [PATCH 36/64] fix LightGBM model conversion --- src/gbt_convertors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gbt_convertors.pyx b/src/gbt_convertors.pyx index 6fa6c50ba9..c031e983ee 100755 --- a/src/gbt_convertors.pyx +++ b/src/gbt_convertors.pyx @@ -26,7 +26,7 @@ import numpy as np class CatBoostNode: def __init__( self, - split: Optional[float] = None, + split: Optional[Dict] = None, value: Optional[List[float]] = None, right: Optional[int] = None, left: Optional[float] = None, From 65fbe168eedd6312a064719e562a624f30649e7b Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 23 Oct 2023 00:56:57 -0700 Subject: [PATCH 37/64] feat: provide XGBoost SHAP example --- .../daal4py/model_builders_xgboost_shap.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 examples/daal4py/model_builders_xgboost_shap.py diff --git a/examples/daal4py/model_builders_xgboost_shap.py b/examples/daal4py/model_builders_xgboost_shap.py new file mode 100644 index 0000000000..82954dc2a7 --- /dev/null +++ b/examples/daal4py/model_builders_xgboost_shap.py @@ -0,0 +1,81 @@ +# ============================================================================== +# Copyright 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+# daal4py Gradient Boosting Regression model creation and SHAP value
+# prediction example
+
+import numpy as np
+import xgboost as xgb
+from scipy.stats import chisquare
+from sklearn.datasets import make_regression
+from sklearn.model_selection import train_test_split
+
+import daal4py as d4p
+
+
+def main():
+    # create data
+    X, y = make_regression(n_samples=10000, n_features=10, random_state=42)
+    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)
+
+    # train the model
+    xgb_model = xgb.XGBRegressor(
+        max_depth=6, n_estimators=100, random_state=42, base_score=0.5
+    )
+    xgb_model.fit(X_train, y_train)
+
+    # Conversion to daal4py
+    daal_model = d4p.mb.convert_model(xgb_model.get_booster())
+
+    # SHAP contributions
+    daal_contribs = daal_model.predict(X_test, pred_contribs=True)
+
+    # SHAP interactions
+    daal_interactions = daal_model.predict(X_test, pred_interactions=True)
+
+    # XGBoost reference values
+    xgb_contribs = xgb_model.get_booster().predict(
+        xgb.DMatrix(X_test), pred_contribs=True, validate_features=False
+    )
+    xgb_interactions = xgb_model.get_booster().predict(
+        xgb.DMatrix(X_test), pred_interactions=True, validate_features=False
+    )
+
+    return (
+        daal_contribs,
+        daal_interactions,
+        xgb_contribs,
+        xgb_interactions,
+    )
+
+
+if __name__ == "__main__":
+    daal_contribs, daal_interactions, xgb_contribs, xgb_interactions = main()
+    print(f"XGBoost SHAP contributions shape: {xgb_contribs.shape}")
+    print(f"daal4py SHAP contributions shape: {daal_contribs.shape}")
+
+    print(f"XGBoost SHAP interactions shape: {xgb_interactions.shape}")
+    print(f"daal4py SHAP interactions shape: {daal_interactions.shape}")
+
+    contribution_rmse = np.sqrt(
+        np.mean((daal_contribs.reshape(-1, 1) - xgb_contribs.reshape(-1, 1)) ** 2)
+    )
+    print(f"SHAP contributions RMSE: {contribution_rmse:.2e}")
+
+    interaction_rmse = np.sqrt(
+        np.mean((daal_interactions.reshape(-1, 1) - xgb_interactions.reshape(-1, 1)) ** 2)
+    )
+    print(f"SHAP interactions RMSE: {interaction_rmse:.2e}")

From 8e89e0ae8a8eae8508032f6f61c86072894df9bb Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Mon, 23 Oct 2023 00:57:23 -0700
Subject: [PATCH 38/64] clean imports

---
 examples/daal4py/model_builders_xgboost_shap.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/daal4py/model_builders_xgboost_shap.py b/examples/daal4py/model_builders_xgboost_shap.py
index 82954dc2a7..0302c191a9 100644
--- a/examples/daal4py/model_builders_xgboost_shap.py
+++ b/examples/daal4py/model_builders_xgboost_shap.py
@@ -19,7 +19,6 @@

 import numpy as np
 import xgboost as xgb
-from scipy.stats import chisquare
 from sklearn.datasets import make_regression
 from sklearn.model_selection import train_test_split


From 4a14e86b75890d9bcece17b8a28058660b3c6672 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Mon, 23 Oct 2023 02:39:02 -0700
Subject: [PATCH 39/64] Include SHAP description

---
 doc/daal4py/model-builders.rst | 54 +++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/doc/daal4py/model-builders.rst b/doc/daal4py/model-builders.rst
index 3d1f9e7b26..c676e0662c 100644
--- a/doc/daal4py/model-builders.rst
+++ b/doc/daal4py/model-builders.rst
@@ -24,17 +24,17 @@ Model Builders for the Gradient Boosting Frameworks

 Introduction
 ------------------
-Gradient boosting on decision trees is one of the most accurate and efficient
-machine learning algorithms for classification and regression.
-The most popular implementations of it are: +Gradient boosting on decision trees is one of the most accurate and efficient +machine learning algorithms for classification and regression. +The most popular implementations of it are: * XGBoost* * LightGBM* * CatBoost* daal4py Model Builders deliver the accelerated -models inference of those frameworks. The inference is performed by the oneDAL GBT implementation tuned -for the best performance on the Intel(R) Architecture. +models inference of those frameworks. The inference is performed by the oneDAL GBT implementation tuned +for the best performance on the Intel(R) Architecture. Conversion --------- @@ -61,22 +61,47 @@ CatBoost:: Classification and Regression Inference ---------------------------------------- -The API is the same for classification and regression inference. -Based on the original model passed to the ``convert_model``, ``d4p_prediction`` is either the classification or regression output. - +The API is the same for classification and regression inference. +Based on the original model passed to the ``convert_model()``, ``d4p_prediction`` is either the classification or regression output. + :: - + d4p_prediction = d4p_model.predict(test_data) Here, the ``predict()`` method of ``d4p_model`` is being used to make predictions on the ``test_data`` dataset. -The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method. +The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method. + +SHAP Value Calculation for Regression Models +---------------------------------------- + +SHAP contribution and interaction value calculation is natively supported by models created with daal4py Model Builders. +For these models, the ``predict()`` method takes additional keyword arguments. + + :: + + d4p_model.predict(test_data, pred_contribs=True) # for SHAP contributions + d4p_model.predict(test_data, pred_interactions=True) # for SHAP interactions + +The returned prediction will have shape ``(n_rows, n_features + 1)`` and ``n_rows, n_features + 1, n_features + 1`` for +SHAP contributions and interactions, respectively. Here, ``n_rows`` is the number of rows (i.e., observations) in +``test_data``, and ``n_features`` is the number of features in the dataset. + +The prediction result for SHAP contributions comprises one feature attribution value per feature and a bias term for +each observation. + +The prediction result for SHAP interactions comprises ``(n_features + 1) x (n_features + 1)`` values for all possible +feature combinations, as well as the accompanying bias terms. + +.. note:: The shapes of SHAP contributions and interactions are consistent with XGBoost's* results. + In contrast, the popular `SHAP Python package* `_ drops bias terms, resulting + in SHAP contributions (SHAP interactions) with one fewer column (one fewer column and row) per observation. Scikit-learn-style Estimators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can also use the scikit-learn-style classes ``GBTDAALClassifier`` and ``GBTDAALRegressor`` to convert and infer your models. For example: -:: +:: from daal4py.sklearn.ensemble import GBTDAALRegressor reg = xgb.XGBRegressor() @@ -88,9 +113,9 @@ Limitations ------------------ Model Builders support only base inference with prediction and probabilities prediction. The functionality is to be extended. Therefore, there are the following limitations: -- The categorical features are not supported for conversion and prediction. 
+- The categorical features are not supported for conversion and prediction. - The multioutput models are not supported for conversion and prediction. -- The tree SHAP calculations are not supported. +- SHAP values can only be calculated for regression models. Examples @@ -98,6 +123,7 @@ Examples Model Builders models conversion - `XGBoost model conversion `_ +- `SHAP value prediction from an XGBoost model ` - `LightGBM model conversion `_ - `CatBoost model conversion `_ From 8c3e0e7e500dcf7828dbed1649da79879917a81a Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 23 Oct 2023 02:41:41 -0700 Subject: [PATCH 40/64] typos --- doc/daal4py/model-builders.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/daal4py/model-builders.rst b/doc/daal4py/model-builders.rst index c676e0662c..7f3fd28b01 100644 --- a/doc/daal4py/model-builders.rst +++ b/doc/daal4py/model-builders.rst @@ -82,8 +82,8 @@ For these models, the ``predict()`` method takes additional keyword arguments. d4p_model.predict(test_data, pred_contribs=True) # for SHAP contributions d4p_model.predict(test_data, pred_interactions=True) # for SHAP interactions -The returned prediction will have shape ``(n_rows, n_features + 1)`` and ``n_rows, n_features + 1, n_features + 1`` for -SHAP contributions and interactions, respectively. Here, ``n_rows`` is the number of rows (i.e., observations) in +The returned prediction will have shape ``(n_rows, n_features + 1)`` and ``(n_rows, n_features + 1, n_features + 1)`` +for SHAP contributions and interactions, respectively. Here, ``n_rows`` is the number of rows (i.e., observations) in ``test_data``, and ``n_features`` is the number of features in the dataset. The prediction result for SHAP contributions comprises one feature attribution value per feature and a bias term for @@ -123,7 +123,7 @@ Examples Model Builders models conversion - `XGBoost model conversion `_ -- `SHAP value prediction from an XGBoost model ` +- `SHAP value prediction from an XGBoost model `_ - `LightGBM model conversion `_ - `CatBoost model conversion `_ From 38559ed5d9df8b0709f87ab3361c961dfe4d4457 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Mon, 23 Oct 2023 03:13:59 -0700 Subject: [PATCH 41/64] chore: move model builder examples to dedicated directory --- examples/{daal4py => model_builders}/log_reg_model_builder.py | 0 examples/{daal4py => model_builders}/model_builders_catboost.py | 0 examples/{daal4py => model_builders}/model_builders_lightgbm.py | 0 examples/{daal4py => model_builders}/model_builders_xgboost.py | 0 .../{daal4py => model_builders}/model_builders_xgboost_shap.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename examples/{daal4py => model_builders}/log_reg_model_builder.py (100%) rename examples/{daal4py => model_builders}/model_builders_catboost.py (100%) rename examples/{daal4py => model_builders}/model_builders_lightgbm.py (100%) rename examples/{daal4py => model_builders}/model_builders_xgboost.py (100%) rename examples/{daal4py => model_builders}/model_builders_xgboost_shap.py (100%) diff --git a/examples/daal4py/log_reg_model_builder.py b/examples/model_builders/log_reg_model_builder.py similarity index 100% rename from examples/daal4py/log_reg_model_builder.py rename to examples/model_builders/log_reg_model_builder.py diff --git a/examples/daal4py/model_builders_catboost.py b/examples/model_builders/model_builders_catboost.py similarity index 100% rename from examples/daal4py/model_builders_catboost.py rename to 
examples/model_builders/model_builders_catboost.py diff --git a/examples/daal4py/model_builders_lightgbm.py b/examples/model_builders/model_builders_lightgbm.py similarity index 100% rename from examples/daal4py/model_builders_lightgbm.py rename to examples/model_builders/model_builders_lightgbm.py diff --git a/examples/daal4py/model_builders_xgboost.py b/examples/model_builders/model_builders_xgboost.py similarity index 100% rename from examples/daal4py/model_builders_xgboost.py rename to examples/model_builders/model_builders_xgboost.py diff --git a/examples/daal4py/model_builders_xgboost_shap.py b/examples/model_builders/model_builders_xgboost_shap.py similarity index 100% rename from examples/daal4py/model_builders_xgboost_shap.py rename to examples/model_builders/model_builders_xgboost_shap.py From bd78635bc9989b01288668948308a3df1fe94c00 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 24 Oct 2023 01:58:48 -0700 Subject: [PATCH 42/64] rename model_builders -> mb --- examples/{model_builders => mb}/log_reg_model_builder.py | 0 examples/{model_builders => mb}/model_builders_catboost.py | 0 examples/{model_builders => mb}/model_builders_lightgbm.py | 0 examples/{model_builders => mb}/model_builders_xgboost.py | 0 examples/{model_builders => mb}/model_builders_xgboost_shap.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename examples/{model_builders => mb}/log_reg_model_builder.py (100%) rename examples/{model_builders => mb}/model_builders_catboost.py (100%) rename examples/{model_builders => mb}/model_builders_lightgbm.py (100%) rename examples/{model_builders => mb}/model_builders_xgboost.py (100%) rename examples/{model_builders => mb}/model_builders_xgboost_shap.py (100%) diff --git a/examples/model_builders/log_reg_model_builder.py b/examples/mb/log_reg_model_builder.py similarity index 100% rename from examples/model_builders/log_reg_model_builder.py rename to examples/mb/log_reg_model_builder.py diff --git a/examples/model_builders/model_builders_catboost.py b/examples/mb/model_builders_catboost.py similarity index 100% rename from examples/model_builders/model_builders_catboost.py rename to examples/mb/model_builders_catboost.py diff --git a/examples/model_builders/model_builders_lightgbm.py b/examples/mb/model_builders_lightgbm.py similarity index 100% rename from examples/model_builders/model_builders_lightgbm.py rename to examples/mb/model_builders_lightgbm.py diff --git a/examples/model_builders/model_builders_xgboost.py b/examples/mb/model_builders_xgboost.py similarity index 100% rename from examples/model_builders/model_builders_xgboost.py rename to examples/mb/model_builders_xgboost.py diff --git a/examples/model_builders/model_builders_xgboost_shap.py b/examples/mb/model_builders_xgboost_shap.py similarity index 100% rename from examples/model_builders/model_builders_xgboost_shap.py rename to examples/mb/model_builders_xgboost_shap.py From 53afa0f295cca0eb0a7f9b31e31f8a0073c6b7fe Mon Sep 17 00:00:00 2001 From: Andreas Huber <9201869+ahuber21@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:55:42 +0200 Subject: [PATCH 43/64] Apply suggestions from code review Co-authored-by: Alexandra --- doc/daal4py/model-builders.rst | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/doc/daal4py/model-builders.rst b/doc/daal4py/model-builders.rst index 7f3fd28b01..4e89b2a849 100644 --- a/doc/daal4py/model-builders.rst +++ b/doc/daal4py/model-builders.rst @@ -72,28 +72,30 @@ Here, the ``predict()`` method of ``d4p_model`` is 
being used to make prediction
 The ``d4p_prediction`` variable stores the predictions made by the ``predict()`` method.

 SHAP Value Calculation for Regression Models
-----------------------------------------
+------------------------------------------------------------

-SHAP contribution and interaction value calculation is natively supported by models created with daal4py Model Builders.
-For these models, the ``predict()`` method takes additional keyword arguments.
+SHAP contribution and interaction value calculation are natively supported by models created with daal4py Model Builders.
+For these models, the ``predict()`` method takes additional keyword arguments:

   ::

     d4p_model.predict(test_data, pred_contribs=True) # for SHAP contributions
     d4p_model.predict(test_data, pred_interactions=True) # for SHAP interactions

-The returned prediction will have shape ``(n_rows, n_features + 1)`` and ``(n_rows, n_features + 1, n_features + 1)``
-for SHAP contributions and interactions, respectively. Here, ``n_rows`` is the number of rows (i.e., observations) in
+The returned prediction has the shape:
+
+  * ``(n_rows, n_features + 1)`` for SHAP contributions
+  * ``(n_rows, n_features + 1, n_features + 1)`` for SHAP interactions
+
+Here, ``n_rows`` is the number of rows (i.e., observations) in
 ``test_data``, and ``n_features`` is the number of features in the dataset.

-The prediction result for SHAP contributions comprises one feature attribution value per feature and a bias term for
-each observation.
+The prediction result for SHAP contributions includes a feature attribution value for each feature and a bias term for each observation.

 The prediction result for SHAP interactions comprises ``(n_features + 1) x (n_features + 1)`` values for all possible
-feature combinations, as well as the accompanying bias terms.
+feature combinations, along with their corresponding bias terms.

-.. note:: The shapes of SHAP contributions and interactions are consistent with XGBoost's* results.
-   In contrast, the popular `SHAP Python package* `_ drops bias terms, resulting
+.. note:: The shapes of SHAP contributions and interactions are consistent with the XGBoost results.
+   In contrast, the `SHAP Python package `_ drops bias terms, resulting
    in SHAP contributions (SHAP interactions) with one fewer column (one fewer column and row) per observation.

 Scikit-learn-style Estimators
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -115,7 +117,7 @@ Model Builders support only base inference with prediction and probabilities pre
 Therefore, there are the following limitations:
 - The categorical features are not supported for conversion and prediction.
 - The multioutput models are not supported for conversion and prediction.
-- SHAP values can only be calculated for regression models.
+- SHAP values can be calculated for regression models only.
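To make the documented shapes concrete, a minimal sketch of the checks one could run (an illustration only, assuming ``d4p_model`` comes from ``d4p.mb.convert_model()`` applied to a regression booster and ``test_data`` is a 2D NumPy array; for regression models the per-row contributions plus the trailing bias column sum to the regular prediction)::

    import numpy as np

    n_rows, n_features = test_data.shape

    # contributions: one value per feature plus a trailing bias column
    contribs = d4p_model.predict(test_data, pred_contribs=True)
    assert contribs.shape == (n_rows, n_features + 1)
    np.testing.assert_allclose(
        contribs.sum(axis=1), d4p_model.predict(test_data), rtol=1e-6
    )

    # interactions: an (n_features + 1) x (n_features + 1) matrix per row
    interactions = d4p_model.predict(test_data, pred_interactions=True)
    assert interactions.shape == (n_rows, n_features + 1, n_features + 1)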
Examples From 11b492e0090f85d64a36a709f752694c195a7252 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Tue, 24 Oct 2023 09:13:21 -0700 Subject: [PATCH 44/64] add reg/clf leaf node wrappers for backwards compatibility --- src/gbt_model_builder.h | 18 ++++++++++++++++++ src/gbt_model_builder.pyx | 7 +++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 8fb30ecd65..76047813d2 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -92,4 +92,22 @@ c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, #endif } +c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) +{ +#if (_gbt_inference_api_version == 2) + return c_ptr->addLeafNode(treeId, parentId, position, response, cover); +#else + return c_ptr->addLeafNode(treeId, parentId, position, response); +#endif +} + +c_gbt_reg_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) +{ +#if (_gbt_inference_api_version == 2) + return c_ptr->addLeafNode(treeId, parentId, position, response, cover); +#else + return c_ptr->addLeafNode(treeId, parentId, position, response); +#endif +} + #endif // _GBT_MODEL_BUILDER_INCLUDED_ diff --git a/src/gbt_model_builder.pyx b/src/gbt_model_builder.pyx index 1639d00552..418390a4ec 100644 --- a/src/gbt_model_builder.pyx +++ b/src/gbt_model_builder.pyx @@ -43,6 +43,9 @@ cdef extern from "gbt_model_builder.h": cdef c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) cdef c_gbt_reg_node_id regAddSplitNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) + cdef c_gbt_clf_node_id clfAddLeafNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, double response, double cover) + cdef c_gbt_clf_node_id regAddLeafNodeWrapper(c_gbt_regression_model_builder * c_ptr, c_gbt_reg_tree_id treeId, c_gbt_reg_node_id parentId, size_t position, double response, double cover) + cdef class gbt_classification_model_builder: ''' Model Builder for gradient boosted trees. 
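For orientation, the Python-side builder calls these wrappers ultimately serve look like the following minimal sketch (illustrative values only; it assumes the ``gbt_reg_model_builder`` factory used elsewhere in this series is reachable as shown, and the ``add_split``/``add_leaf`` signatures given in this patch)::

    import daal4py as d4p

    # hand-build a single regression stump: one split node, two leaves
    # (hypothetical feature index, threshold, responses, and cover values)
    mb = d4p.gbt_reg_model_builder(n_features=2, n_iterations=1)
    tree_id = mb.create_tree(3)
    root_id = mb.add_split(
        tree_id=tree_id, feature_index=0, feature_value=0.5,
        default_left=1, cover=10.0,
    )
    # cover (sum_hess) is forwarded to oneDAL only when the inference API
    # version supports it; the wrappers above drop it otherwise
    mb.add_leaf(tree_id=tree_id, response=-1.0, cover=6.0, parent_id=root_id, position=0)
    mb.add_leaf(tree_id=tree_id, response=1.0, cover=4.0, parent_id=root_id, position=1)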
@@ -76,7 +79,7 @@ cdef class gbt_classification_model_builder: :param double cover: cover (sum_hess) of the leaf node :rtype: node identifier ''' - return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover) + return clfAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover) def add_split(self, c_gbt_clf_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_clf_node_id parent_id=c_gbt_clf_no_parent, size_t position=0): ''' @@ -137,7 +140,7 @@ cdef class gbt_regression_model_builder: :param double cover: cover (sum_hess) of the leaf node :rtype: node identifier ''' - return self.c_ptr.addLeafNode(tree_id, parent_id, position, response, cover) + return regAddLeafNodeWrapper(self.c_ptr, tree_id, parent_id, position, response, cover) def add_split(self, c_gbt_reg_tree_id tree_id, size_t feature_index, double feature_value, int default_left, double cover, c_gbt_reg_node_id parent_id=c_gbt_reg_no_parent, size_t position=0): ''' From 43b4d2a0f41ccc712be62c4a695cc7c162d45a48 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 00:24:28 -0700 Subject: [PATCH 45/64] fix: model retrieve API --- src/gbt_model_builder.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/gbt_model_builder.h b/src/gbt_model_builder.h index 76047813d2..7a99b07b8c 100644 --- a/src/gbt_model_builder.h +++ b/src/gbt_model_builder.h @@ -45,30 +45,22 @@ typedef c_gbt_regression_model_builder::TreeId c_gbt_reg_tree_id; #define c_gbt_clf_no_parent c_gbt_classification_model_builder::noParent #define c_gbt_reg_no_parent c_gbt_regression_model_builder::noParent -#if (_gbt_inference_api_version == 2) static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_, double base_score) { daal::algorithms::gbt::classification::ModelPtr * ptr = RAW()(obj_->getModel()); +#if (_gbt_inference_api_version == 2) ptr->get()->setPredictionBias(base_score); +#endif return ptr; } static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_, double base_score) { daal::algorithms::gbt::regression::ModelPtr * ptr = RAW()(obj_->getModel()); +#if (_gbt_inference_api_version == 2) ptr->get()->setPredictionBias(base_score); +#endif return ptr; } -#else -static daal::algorithms::gbt::classification::ModelPtr * get_gbt_classification_model_builder_model(daal::algorithms::gbt::classification::ModelBuilder * obj_) -{ - return RAW()(obj_->getModel()); -} - -static daal::algorithms::gbt::regression::ModelPtr * get_gbt_regression_model_builder_model(daal::algorithms::gbt::regression::ModelBuilder * obj_) -{ - return RAW()(obj_->getModel()); -} -#endif c_gbt_clf_node_id clfAddSplitNodeWrapper(c_gbt_classification_model_builder * c_ptr, c_gbt_clf_tree_id treeId, c_gbt_clf_node_id parentId, size_t position, size_t featureIndex, double featureValue, int defaultLeft, double cover) { From da24ef9a43541bb6ad757eb4d0a26a1e7ac84421 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 01:02:25 -0700 Subject: [PATCH 46/64] chore: remove requirements-test-optional.txt --- .ci/pipeline/build-and-test-mac.yml | 2 +- .ci/pipeline/build-and-test-win.yml | 2 +- .ci/pipeline/nightly.yml | 2 +- requirements-test-optional.txt | 4 ---- requirements-test.txt | 4 ++-- 5 files changed, 5 insertions(+), 9 deletions(-) delete mode 100644 
requirements-test-optional.txt diff --git a/.ci/pipeline/build-and-test-mac.yml b/.ci/pipeline/build-and-test-mac.yml index c9f6d05345..0df12bc5d0 100644 --- a/.ci/pipeline/build-and-test-mac.yml +++ b/.ci/pipeline/build-and-test-mac.yml @@ -40,7 +40,7 @@ steps: - script: | source activate CB bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) - pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt + pip install --upgrade -r requirements-test.txt pip install $(python .ci/scripts/get_compatible_scipy_version.py) pip list displayName: 'Install testing requirements' diff --git a/.ci/pipeline/build-and-test-win.yml b/.ci/pipeline/build-and-test-win.yml index 1bf9d2e365..2875513cb3 100644 --- a/.ci/pipeline/build-and-test-win.yml +++ b/.ci/pipeline/build-and-test-win.yml @@ -43,7 +43,7 @@ steps: set PATH=C:\msys64\usr\bin;%PATH% call activate CB bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION) - pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt + pip install --upgrade -r requirements-test.txt cd .. for /f "delims=" %%c in ('python s\.ci\scripts\get_compatible_scipy_version.py') do set SCIPY_VERSION=%%c pip install %SCIPY_VERSION% diff --git a/.ci/pipeline/nightly.yml b/.ci/pipeline/nightly.yml index d6ea8393e4..7c3e707cfe 100644 --- a/.ci/pipeline/nightly.yml +++ b/.ci/pipeline/nightly.yml @@ -64,7 +64,7 @@ jobs: conda activate CB pip install -r dependencies-dev pip install -r requirements-doc.txt - pip install -r requirements-test.txt -r requirements-test-optional.txt + pip install -r requirements-test.txt pip install jupyter matplotlib requests displayName: 'Install requirements' - script: | diff --git a/requirements-test-optional.txt b/requirements-test-optional.txt deleted file mode 100644 index 45e2575ef4..0000000000 --- a/requirements-test-optional.txt +++ /dev/null @@ -1,4 +0,0 @@ -xgboost==1.7.6; python_version <= '3.9' -xgboost==2.0.0; python_version >= '3.10' -lightgbm==4.1.0 -catboost==1.2.2; python_version <= '3.11' diff --git a/requirements-test.txt b/requirements-test.txt index a34e1b2cd0..8e3d520d3d 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -9,5 +9,5 @@ pandas==2.1.1 ; python_version >= '3.9' xgboost==1.7.6; python_version <= '3.9' xgboost==2.0.0; python_version >= '3.10' lightgbm==4.1.0 -catboost==1.2.1 -shap==0.42.1 +catboost==1.2.2; python_version <= '3.11' +catboost>=1.2.2; python_version >= '3.12' From f311e27d4a19e0c8d5c8dc083520fc942aea227d Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 01:29:50 -0700 Subject: [PATCH 47/64] Update CODEOWNERS after removing requirements-test-optional.txt --- .github/CODEOWNERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index ceb64309c4..53dc619061 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -13,17 +13,16 @@ requirements-doc.txt @maria-Petrova @napetrov @aepanchi @Alexsandruss onedal/ @Alexsandruss @samir-nasibli @KulikovNikita sklearnex/ @Alexsandruss @samir-nasibli @KulikovNikita -# Examples +# Examples examples/ @maria-Petrova @Alexsandruss @samir-nasibli @napetrov # Dependencies setup.py @napetrov @Alexsandruss @samir-nasibli requirements* @napetrov @Alexsandruss @samir-nasibli @homksei @ahuber21 @ethanglaser -conda-recipe/ @napetrov @Alexsandruss +conda-recipe/ @napetrov @Alexsandruss # Model builders *model_builders* @razdoburdin @ahuber21 @avolkov-intel -requirements-test-optional.txt @razdoburdin @ahuber21 @avolkov-intel # Forests *ensemble* 
@ahuber21 @icfaust From 523d160065de738220a4d5e2e8f422ec7e10c49d Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 02:35:16 -0700 Subject: [PATCH 48/64] fix: add new mb path to test_examples sys.path --- tests/test_examples.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index a9e8adaf23..00f300a47d 100755 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -19,9 +19,11 @@ test_path = os.path.abspath(os.path.dirname(__file__)) unittest_data_path = os.path.join(test_path, "unittest_data") -examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py") -sys.path.insert(0, examples_path) -os.chdir(examples_path) +daal4py_examples_path = os.path.join(os.path.dirname(test_path), "examples", "daal4py") +mb_examples_path = os.path.join(os.path.dirname(test_path), "examples", "mb") +sys.path.insert(0, daal4py_examples_path) +sys.path.insert(0, mb_examples_path) +os.chdir(daal4py_examples_path) import unittest From 3c586100d9bd19f8fb6a730245d012fc6d2d9eb7 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 02:37:50 -0700 Subject: [PATCH 49/64] feat: add xgboost_shap example to testing for 2024.0.1 --- tests/test_examples.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_examples.py b/tests/test_examples.py index 00f300a47d..19ea78f7a6 100755 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -272,6 +272,13 @@ def test_svm(self): ((2020, "P", 2), (2021, "B", 109)), ["xgboost"], ), + ( + "model_builders_xgboost_shap", + None, + None, + (2023, "P", 1), + ["xgboost"], + ), ("model_builders_catboost", None, None, (2021, "P", 4), ["catboost"]), ("gradient_boosted_classification",), ("gradient_boosted_regression",), From a7170af6310419430e04f48df6338c34a44d430c Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 03:09:04 -0700 Subject: [PATCH 50/64] fix: add shap to test requirements --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index 8e3d520d3d..8b2c436e98 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -11,3 +11,4 @@ xgboost==2.0.0; python_version >= '3.10' lightgbm==4.1.0 catboost==1.2.2; python_version <= '3.11' catboost>=1.2.2; python_version >= '3.12' +shap==0.42.1 From 6a01c63aafc14e883d2fe42e5bdc7969e6b3fff1 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 03:09:25 -0700 Subject: [PATCH 51/64] Skip SHAP checks for older versions --- tests/test_model_builders.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index 4c833f0e07..eddae27a37 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -19,6 +19,7 @@ import catboost as cb import lightgbm as lgbm import numpy as np +import pytest import shap import xgboost as xgb from sklearn.datasets import ( @@ -31,6 +32,13 @@ from sklearn.model_selection import train_test_split import daal4py as d4p +from daal4py.sklearn._utils import daal_check_version + +shap_required_version = (2024, "P", 1) +shap_supported = daal_check_version(shap_required_version) +shap_not_supported_str = ( + f"SHAP value calculation only supported for version {shap_required_version} or later" +) class LogRegModelBuilder(unittest.TestCase): @@ -134,6 +142,7 @@ def test_missing_value_support(self): xgboost_pred = self.xgb_model.predict(self.X_nan) 
np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -150,6 +159,7 @@ def test_model_predict_shap_contribs(self): ) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -166,6 +176,7 @@ def test_model_predict_shap_interactions(self): ) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -246,12 +257,14 @@ def test_missing_value_support(self): xgboost_pred = self.xgb_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -356,6 +369,7 @@ def test_missing_value_support(self): lgbm_pred = self.lgbm_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test, pred_contribs=True) @@ -369,6 +383,7 @@ def test_model_predict_shap_contribs(self): np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column @@ -381,8 +396,7 @@ def test_model_predict_shap_interactions(self): ) np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6) - # Will revisit further LightGBM validation after resolving microsoft/LightGBM#6139 - # @unittest.skipIf(lgbm.__version__ == "4.1.0", "LightGBM models from dump_model() are possibly broken in v4.1.0") + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan, pred_contribs=True) @@ -439,16 +453,19 @@ def test_missing_value_support(self): lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): 
m.predict(self.X_test, pred_interactions=True) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): @@ -508,16 +525,19 @@ def test_model_predict_proba_missing_values(self): lgbm_pred = self.lgbm_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_interactions=True) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): @@ -563,6 +583,7 @@ def test_missing_value_support(self): lgbm_pred = self.cb_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): # SHAP value support from CatBoost models is to be added with self.assertWarnsRegex( @@ -613,6 +634,7 @@ def test_missing_value_support(self): cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0] np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) + @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): # SHAP value support from CatBoost models is to be added with self.assertWarnsRegex( From f7031a1edbfc2ce9047802fe99a4d65d1f39b07c Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 25 Oct 2023 03:28:15 -0700 Subject: [PATCH 52/64] fixup: skip shap tests if *not* daal_check_version(...) 
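The guard introduced in the previous patch was inverted: the decorators skipped the SHAP tests exactly when SHAP support *is* available. The intended pattern, sketched here with the module-level names this series defines (the reason string is illustrative):

    import pytest
    from daal4py.sklearn._utils import daal_check_version

    shap_required_version = (2024, "P", 1)
    # skip only when the runtime oneDAL predates SHAP support (note the `not`)
    shap_not_supported = not daal_check_version(shap_required_version)
    shap_not_supported_str = "SHAP value calculation requires a newer oneDAL"

    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
    def test_model_predict_shap_contribs():
        ...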
--- tests/test_model_builders.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py index eddae27a37..d6106ffc3c 100644 --- a/tests/test_model_builders.py +++ b/tests/test_model_builders.py @@ -35,7 +35,7 @@ from daal4py.sklearn._utils import daal_check_version shap_required_version = (2024, "P", 1) -shap_supported = daal_check_version(shap_required_version) +shap_not_supported = not daal_check_version(shap_required_version) shap_not_supported_str = ( f"SHAP value calculation only supported for version {shap_required_version} or later" ) @@ -142,7 +142,7 @@ def test_missing_value_support(self): xgboost_pred = self.xgb_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -159,7 +159,7 @@ def test_model_predict_shap_contribs(self): ) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -176,7 +176,7 @@ def test_model_predict_shap_interactions(self): ) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -257,14 +257,14 @@ def test_missing_value_support(self): xgboost_pred = self.xgb_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): booster = self.xgb_model.get_booster() m = d4p.mb.convert_model(booster) @@ -369,7 +369,7 @@ def test_missing_value_support(self): lgbm_pred = self.lgbm_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_test, pred_contribs=True) @@ -383,7 +383,7 @@ def test_model_predict_shap_contribs(self): np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) # SHAP Python package 
drops bias terms from the returned matrix, therefore we drop the final row & column @@ -396,7 +396,7 @@ def test_model_predict_shap_interactions(self): ) np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) d4p_pred = m.predict(self.X_nan, pred_contribs=True) @@ -453,19 +453,19 @@ def test_missing_value_support(self): lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_interactions=True) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): @@ -525,19 +525,19 @@ def test_model_predict_proba_missing_values(self): lgbm_pred = self.lgbm_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_contribs=True) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_interactions(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): m.predict(self.X_test, pred_interactions=True) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs_missing_values(self): m = d4p.mb.convert_model(self.lgbm_model) with self.assertRaises(NotImplementedError): @@ -583,7 +583,7 @@ def test_missing_value_support(self): lgbm_pred = self.cb_model.predict(self.X_nan) np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): # SHAP value support from CatBoost models is to be added with self.assertWarnsRegex( @@ -634,7 +634,7 @@ def test_missing_value_support(self): cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0] np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7) - @pytest.mark.skipif(shap_supported, reason=shap_not_supported_str) + @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str) def test_model_predict_shap_contribs(self): # SHAP value 
support from CatBoost models is to be added
         with self.assertWarnsRegex(

From e029fc84ba30e0b01de56c0a67e5427660e1ad24 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 04:02:31 -0700
Subject: [PATCH 53/64] Let main() accept args and kwargs

---
 examples/mb/model_builders_xgboost_shap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/mb/model_builders_xgboost_shap.py b/examples/mb/model_builders_xgboost_shap.py
index 0302c191a9..7780714fd5 100644
--- a/examples/mb/model_builders_xgboost_shap.py
+++ b/examples/mb/model_builders_xgboost_shap.py
@@ -25,7 +25,7 @@
 import daal4py as d4p


-def main():
+def main(*args, **kwargs):
     # create data
     X, y = make_regression(n_samples=10000, n_features=10, random_state=42)
     X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)

From 02aaf333a5ff45bfb79d4793e9b8f57bcbcf8524 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 04:22:23 -0700
Subject: [PATCH 54/64] fix: only request resultsToCompute with compatible versions

---
 daal4py/mb/model_builders.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py
index 1dbc146275..b580daab9a 100644
--- a/daal4py/mb/model_builders.py
+++ b/daal4py/mb/model_builders.py
@@ -214,7 +214,30 @@ def _predict_regression(
             ).format(type(self).__name__)
         )

-        # Prediction
+        try:
+            return self._predict_regression_with_results_to_compute(
+                X, fptype, pred_contribs, pred_interactions
+            )
+        except TypeError as e:
+            if "unexpected keyword argument 'resultsToCompute'" in str(e):
+                if pred_contribs or pred_interactions:
+                    # SHAP values requested, but not supported by this version
+                    raise TypeError(
+                        f"{'pred_contribs' if pred_contribs else 'pred_interactions'} not supported by this version of daal4py"
+                    )
+            else:
+                # unknown type error
+                raise
+
+        # fallback to calculation without `resultsToCompute`
+        predict_algo = d4p.gbt_regression_prediction(fptype=fptype)
+        predict_result = predict_algo.compute(X, self.daal_model_)
+        return predict_result.prediction.ravel()
+
+    def _predict_regression_with_results_to_compute(
+        self, X, fptype, pred_contribs=False, pred_interactions=False
+    ):
+        """Assume daal4py supports the resultsToCompute kwarg"""
         resultsToCompute = ""
         if pred_contribs:
             resultsToCompute = "shapContributions"

From 014306761ab2e522a0aeb31dfa61b22167216b53 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 05:46:57 -0700
Subject: [PATCH 55/64] fixup: better error reporting

---
 daal4py/mb/model_builders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/daal4py/mb/model_builders.py b/daal4py/mb/model_builders.py
index b580daab9a..bad2eddf19 100644
--- a/daal4py/mb/model_builders.py
+++ b/daal4py/mb/model_builders.py
@@ -224,7 +224,7 @@ def _predict_regression(
                     # SHAP values requested, but not supported by this version
                     raise TypeError(
                         f"{'pred_contribs' if pred_contribs else 'pred_interactions'} not supported by this version of daal4py"
-                    )
+                    ) from e
             else:
                 # unknown type error
                 raise

From 41cda26b25c7a5a39fe76d193a8376c0057cfe62 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 05:47:14 -0700
Subject: [PATCH 56/64] use pytest for main()

---
 tests/test_model_builders.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
index d6106ffc3c..7dca304887 100644
--- a/tests/test_model_builders.py
+++ 
b/tests/test_model_builders.py
@@ -111,6 +111,7 @@ def test_breast_cancer_without_intercept(self):
         self.assertTrue(np.allclose(pred_daal, pred_sklearn))


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5):
@@ -142,7 +143,6 @@ def test_missing_value_support(self):
         xgboost_pred = self.xgb_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
@@ -159,7 +159,6 @@ def test_model_predict_shap_contribs(self):
         )
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_interactions(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
@@ -176,7 +175,6 @@ def test_model_predict_shap_interactions(self):
         )
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs_missing_values(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
@@ -191,6 +189,7 @@ def test_model_predict_shap_contribs_missing_values(self):


 # duplicate all tests for bae_score=0.0
+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -198,12 +197,14 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=100
+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostRegressionModelBuilder.setUpClass(100)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"):
@@ -257,14 +258,12 @@ def test_missing_value_support(self):
         xgboost_pred = self.xgb_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-7)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_test, pred_contribs=True)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_interactions(self):
         booster = self.xgb_model.get_booster()
         m = d4p.mb.convert_model(booster)
@@ -273,6 +272,7 @@ def test_model_predict_shap_interactions(self):


 # duplicate all tests for bae_score=0.3
+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -280,18 +280,21 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=0.7
+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(base_score=0.7)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5_base_score03(
     XGBoostClassificationModelBuilder
 ):
@@ -300,6 +303,7 @@ def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_objective_logitraw(
     XGBoostClassificationModelBuilder
 ):
@@ -328,6 +332,7 @@ def test_model_predict_proba(self):
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -369,7 +374,6 @@ def test_missing_value_support(self):
         lgbm_pred = self.lgbm_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         d4p_pred = m.predict(self.X_test, pred_contribs=True)
@@ -383,7 +387,6 @@ def test_model_predict_shap_contribs(self):
         np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_interactions(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column
@@ -396,7 +399,6 @@ def test_model_predict_shap_interactions(self):
         )
         np.testing.assert_allclose(d4p_pred, shap_pred, rtol=1e-6)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs_missing_values(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         d4p_pred = m.predict(self.X_nan, pred_contribs=True)
@@ -408,6 +410,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -453,25 +456,23 @@ def test_missing_value_support(self):
         lgbm_pred = np.argmax(self.lgbm_model.predict(self.X_nan), axis=1)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_test, pred_contribs=True)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_interactions(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_test, pred_interactions=True)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs_missing_values(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_nan, pred_contribs=True)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -525,25 +526,23 @@ def test_model_predict_proba_missing_values(self):
         lgbm_pred = self.lgbm_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_test, pred_contribs=True)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_interactions(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_test, pred_interactions=True)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs_missing_values(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         with self.assertRaises(NotImplementedError):
             m.predict(self.X_nan, pred_contribs=True)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class CatBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -583,7 +582,6 @@ def test_missing_value_support(self):
         lgbm_pred = self.cb_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-7)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         # SHAP value support from CatBoost models is to be added
         with self.assertWarnsRegex(
@@ -593,6 +591,7 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class CatBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -634,7 +633,6 @@ def test_missing_value_support(self):
         cb_pred = self.cb_model.predict(self.X_nan, prediction_type="Class").T[0]
         np.testing.assert_allclose(d4p_pred, cb_pred, rtol=1e-7)

-    @pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
     def test_model_predict_shap_contribs(self):
         # SHAP value support from CatBoost models is to be added
         with self.assertWarnsRegex(
@@ -644,6 +642,7 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


+@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostEarlyStopping(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -760,4 +759,4 @@ def get_dump(self, *_, **kwargs):


 if __name__ == "__main__":
-    unittest.main()
+    pytest.main([__file__])

From dcba3afda5161494b477d415edc78d3644c1719e Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 07:28:28 -0700
Subject: [PATCH 57/64] fix: use unittest.skipIf

---
 tests/test_model_builders.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
index 7dca304887..905299f58a 100644
--- a/tests/test_model_builders.py
+++ b/tests/test_model_builders.py
@@ -19,7 +19,6 @@
 import catboost as cb
 import lightgbm as lgbm
 import numpy as np
-import pytest
 import shap
 import xgboost as xgb
 from sklearn.datasets import (
@@ -111,7 +110,7 @@ def test_breast_cancer_without_intercept(self):
         self.assertTrue(np.allclose(pred_daal, pred_sklearn))


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5):
@@ -189,7 +188,7 @@ def test_model_predict_shap_contribs_missing_values(self):


 # duplicate all tests for bae_score=0.0
-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -197,14 +196,14 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=100
-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostRegressionModelBuilder.setUpClass(100)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"):
@@ -272,7 +271,7 @@ def test_model_predict_shap_interactions(self):


 # duplicate all tests for bae_score=0.3
-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -280,21 +279,21 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=0.7
-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(base_score=0.7)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5_base_score03(
     XGBoostClassificationModelBuilder
 ):
@@ -303,7 +302,7 @@ def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_objective_logitraw(
     XGBoostClassificationModelBuilder
 ):
@@ -332,7 +331,7 @@ def test_model_predict_proba(self):
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -410,7 +409,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -472,7 +471,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         m.predict(self.X_nan, pred_contribs=True)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -542,7 +541,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         m.predict(self.X_nan, pred_contribs=True)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class CatBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -591,7 +590,7 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class CatBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -642,7 +641,7 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


-@pytest.mark.skipif(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
 class XGBoostEarlyStopping(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -759,4 +758,4 @@ def get_dump(self, *_, **kwargs):


 if __name__ == "__main__":
-    pytest.main([__file__])
+    unittest.main()
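Note on the change above (background, not part of the patch): pytest marks such as @pytest.mark.skipif are only honoured when the suite runs under the pytest runner; the stock unittest runner ignores them, so the class-level skips would silently not apply under "python -m unittest". unittest.skipIf is recognised by both runners, which is also why this patch can drop the pytest import and restore unittest.main() as the entry point. A minimal sketch with a hypothetical condition:

    import unittest

    feature_missing = True  # hypothetical skip condition

    @unittest.skipIf(feature_missing, "feature not available")
    class SkippedTests(unittest.TestCase):
        def test_never_runs(self):
            self.fail("not reached while feature_missing is True")

    if __name__ == "__main__":
        unittest.main()  # reports the class as skipped under plain unittest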
From c8b2a69f7d3d530151f4dd23e462f556f3054202 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Wed, 25 Oct 2023 08:17:13 -0700
Subject: [PATCH 58/64] fix: typo 2023 -> 2024

---
 tests/test_examples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_examples.py b/tests/test_examples.py
index 19ea78f7a6..c509416148 100755
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -276,7 +276,7 @@ def test_svm(self):
         "model_builders_xgboost_shap",
         None,
         None,
-        (2023, "P", 1),
+        (2024, "P", 1),
         ["xgboost"],
     ),
     ("model_builders_catboost", None, None, (2021, "P", 4), ["catboost"]),

From 8bc6d7ca2aff3865d633f65075e7b65b438ce33e Mon Sep 17 00:00:00 2001
From: Andreas Huber <9201869+ahuber21@users.noreply.github.com>
Date: Thu, 26 Oct 2023 14:01:25 +0200
Subject: [PATCH 59/64] Drop 3.12 requirement

Co-authored-by: Nikolay Petrov
---
 requirements-test.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 8b2c436e98..6a5318a058 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -10,5 +10,4 @@ xgboost==1.7.6; python_version <= '3.9'
 xgboost==2.0.0; python_version >= '3.10'
 lightgbm==4.1.0
 catboost==1.2.2; python_version <= '3.11'
-catboost>=1.2.2; python_version >= '3.12'
 shap==0.42.1

From cfe06074517879f76b26c9746c009131709986fd Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 26 Oct 2023 05:04:13 -0700
Subject: [PATCH 60/64] cleanup after rebase

---
 .ci/pipeline/build-and-test-lnx.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/pipeline/build-and-test-lnx.yml b/.ci/pipeline/build-and-test-lnx.yml
index fb03f51f53..f62c827c9e 100644
--- a/.ci/pipeline/build-and-test-lnx.yml
+++ b/.ci/pipeline/build-and-test-lnx.yml
@@ -45,7 +45,7 @@ steps:
       . /usr/share/miniconda/etc/profile.d/conda.sh
       conda activate CB
       bash .ci/scripts/setup_sklearn.sh $(SKLEARN_VERSION)
-      pip install --upgrade -r requirements-test.txt -r requirements-test-optional.txt
+      pip install --upgrade -r requirements-test.txt
       pip install $(python .ci/scripts/get_compatible_scipy_version.py)
       if [ $(echo $(PYTHON_VERSION) | grep '3.8\|3.9\|3.10') ]; then conda install -q -y -c intel dpnp; fi
      pip list

From 701a2ff8a256ec39b87bb761f6415db9be2872df Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Thu, 26 Oct 2023 07:57:10 -0700
Subject: [PATCH 61/64] Skip SHAP install & tests on 3.12

---
 requirements-test.txt        |  2 +-
 tests/test_model_builders.py | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 6a5318a058..dedcb637c8 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -10,4 +10,4 @@ xgboost==1.7.6; python_version <= '3.9'
 xgboost==2.0.0; python_version >= '3.10'
 lightgbm==4.1.0
 catboost==1.2.2; python_version <= '3.11'
-shap==0.42.1
+shap==0.42.1; python_version < '3.12' # FIXME: some dependencies do not yet support 3.12

diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
index 905299f58a..034239dad3 100644
--- a/tests/test_model_builders.py
+++ b/tests/test_model_builders.py
@@ -19,7 +19,13 @@
 import catboost as cb
 import lightgbm as lgbm
 import numpy as np
-import shap
+
+try:
+    import shap
+
+    shap_available = True
+except ImportError:
+    shap_available = False
 import xgboost as xgb
 from sklearn.datasets import (
     load_breast_cancer,
@@ -38,6 +44,7 @@
 shap_not_supported_str = (
     f"SHAP value calculation only supported for version {shap_required_version} or later"
 )
+shap_unavailable_str = f"SHAP Python package not available"


 class LogRegModelBuilder(unittest.TestCase):
@@ -373,6 +380,7 @@ def test_missing_value_support(self):
         lgbm_pred = self.lgbm_model.predict(self.X_nan)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=5e-6)

+    @unittest.skipUnless(shap_available, reason=shap_unavailable_str)
     def test_model_predict_shap_contribs(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         d4p_pred = m.predict(self.X_test, pred_contribs=True)
@@ -386,6 +394,7 @@ def test_model_predict_shap_contribs(self):
         np.testing.assert_allclose(d4p_pred[:, :-1], shap_pred, rtol=1e-6)
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)

+    @unittest.skipUnless(shap_available, reason=shap_unavailable_str)
     def test_model_predict_shap_interactions(self):
         m = d4p.mb.convert_model(self.lgbm_model)
         # SHAP Python package drops bias terms from the returned matrix, therefore we drop the final row & column
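Background on the import guard introduced above (illustrative, not part of the patch): the try/except keeps the module importable when shap is missing, as on Python 3.12 here, and unittest.skipUnless turns the dependent tests into reported skips rather than import-time failures. A self-contained sketch of the pattern, assuming only the standard library plus an optional shap install:

    import unittest

    try:
        import shap  # optional dependency; may be absent on Python 3.12

        shap_available = True
    except ImportError:
        shap_available = False


    class OptionalDependencyTests(unittest.TestCase):
        @unittest.skipUnless(shap_available, "SHAP Python package not available")
        def test_uses_shap(self):
            # runs only when the guarded import above succeeded
            self.assertTrue(hasattr(shap, "TreeExplainer"))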
install & tests on 3.12

---
 requirements-test.txt        |  4 +--
 tests/test_model_builders.py | 60 +++++++++++++++++++++---------------
 2 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index d241e78767..48cd49bf91 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -9,5 +9,5 @@ pandas==2.1.1 ; python_version >= '3.9'
 xgboost==1.7.6; python_version <= '3.9'
 xgboost==2.0.0; python_version >= '3.10'
 lightgbm==4.1.0
-catboost==1.2.2
-shap==0.42.1; python_version < '3.12' # FIXME: some dependencies do not yet support 3.12
+catboost==1.2.2; python_version <= '3.11'
+shap==0.42.1; python_version <= '3.11'

diff --git a/tests/test_model_builders.py b/tests/test_model_builders.py
index 034239dad3..93b39fd77e 100644
--- a/tests/test_model_builders.py
+++ b/tests/test_model_builders.py
@@ -16,16 +16,8 @@

 import unittest

-import catboost as cb
 import lightgbm as lgbm
 import numpy as np
-
-try:
-    import shap
-
-    shap_available = True
-except ImportError:
-    shap_available = False
 import xgboost as xgb
 from sklearn.datasets import (
     load_breast_cancer,
@@ -39,12 +31,28 @@
 import daal4py as d4p
 from daal4py.sklearn._utils import daal_check_version

+try:
+    import catboost as cb
+
+    cb_available = True
+except ImportError:
+    cb_available = False
+
+try:
+    import shap
+
+    shap_available = True
+except ImportError:
+    shap_available = False
+
+
 shap_required_version = (2024, "P", 1)
-shap_not_supported = not daal_check_version(shap_required_version)
+shap_supported = daal_check_version(shap_required_version)
 shap_not_supported_str = (
     f"SHAP value calculation only supported for version {shap_required_version} or later"
 )
-shap_unavailable_str = f"SHAP Python package not available"
+shap_unavailable_str = "SHAP Python package not available"
+cb_unavailable_str = "CatBoost not available"


 class LogRegModelBuilder(unittest.TestCase):
@@ -117,7 +125,7 @@ def test_breast_cancer_without_intercept(self):
         self.assertTrue(np.allclose(pred_daal, pred_sklearn))


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5):
@@ -195,7 +203,7 @@ def test_model_predict_shap_contribs_missing_values(self):


 # duplicate all tests for bae_score=0.0
-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score0(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -203,14 +211,14 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=100
-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostRegressionModelBuilder_base_score100(XGBoostRegressionModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostRegressionModelBuilder.setUpClass(100)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls, base_score=0.5, n_classes=2, objective="binary:logistic"):
@@ -278,7 +286,7 @@ def test_model_predict_shap_interactions(self):


 # duplicate all tests for bae_score=0.3
-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score03(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
@@ -286,21 +294,21 @@ def setUpClass(cls):


 # duplicate all tests for bae_score=0.7
-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_base_score07(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(base_score=0.7)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5(XGBoostClassificationModelBuilder):
     @classmethod
     def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_n_classes5_base_score03(
     XGBoostClassificationModelBuilder
 ):
@@ -309,7 +317,7 @@ def setUpClass(cls):
         XGBoostClassificationModelBuilder.setUpClass(n_classes=5, base_score=0.3)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostClassificationModelBuilder_objective_logitraw(
     XGBoostClassificationModelBuilder
 ):
@@ -338,7 +346,7 @@ def test_model_predict_proba(self):
         np.testing.assert_allclose(d4p_pred, xgboost_pred, rtol=1e-5)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class LightGBMRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -418,7 +426,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         np.testing.assert_allclose(d4p_pred, lgbm_pred, rtol=1e-6)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -480,7 +488,7 @@ def test_model_predict_shap_contribs_missing_values(self):
         m.predict(self.X_nan, pred_contribs=True)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class LightGBMClassificationModelBuilder_binaryClassification(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -550,7 +558,8 @@ def test_model_predict_shap_contribs_missing_values(self):
         m.predict(self.X_nan, pred_contribs=True)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(cb_available, reason=cb_unavailable_str)
 class CatBoostRegressionModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -599,7 +608,8 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(cb_available, reason=cb_unavailable_str)
 class CatBoostClassificationModelBuilder(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -650,7 +660,7 @@ def test_model_predict_shap_contribs(self):
             d4p.mb.convert_model(self.cb_model)


-@unittest.skipIf(shap_not_supported, reason=shap_not_supported_str)
+@unittest.skipUnless(shap_supported, reason=shap_not_supported_str)
 class XGBoostEarlyStopping(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
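Side note on the decorator rewrite above (background, not part of the patch): skipIf(shap_not_supported, ...) and skipUnless(shap_supported, ...) are logically equivalent, but skipUnless states the positive condition a test needs, which reads better now that the guards express availability. The decorators also stack, as the CatBoost classes show; a class runs only when every stacked condition holds. A small sketch with hypothetical flags:

    import unittest

    lib_a_available = True  # hypothetical availability flags
    lib_b_available = False

    @unittest.skipUnless(lib_a_available, "lib_a not available")
    @unittest.skipUnless(lib_b_available, "lib_b not available")
    class NeedsBothLibraries(unittest.TestCase):
        def test_combined(self):
            # skipped whenever either flag above is False
            pass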
From 4f490ea5cde48c4a7152ba3a4096be3b1b139a70 Mon Sep 17 00:00:00 2001
From: Andreas Huber
Date: Fri, 27 Oct 2023 01:30:36 -0700
Subject: [PATCH 64/64] chore: add fixmes for catboost and shap support on 3.12

---
 requirements-test.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-test.txt b/requirements-test.txt
index 48cd49bf91..fc9c0ad4eb 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -9,5 +9,5 @@ pandas==2.1.1 ; python_version >= '3.9'
 xgboost==1.7.6; python_version <= '3.9'
 xgboost==2.0.0; python_version >= '3.10'
 lightgbm==4.1.0
-catboost==1.2.2; python_version <= '3.11'
-shap==0.42.1; python_version <= '3.11'
+catboost==1.2.2; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported
+shap==0.42.1; python_version <= '3.11' # FIXME: Add as soon as 3.12 is supported
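Closing note on the requirement pins (illustrative, not part of the patch series): the "; python_version <= '3.11'" suffix is a PEP 508 environment marker. pip evaluates the marker against the running interpreter and skips a line whose marker is false, so Python 3.12 environments simply omit catboost and shap instead of failing to resolve. One way to inspect how a marker evaluates, assuming the packaging library is installed:

    from packaging.markers import Marker

    marker = Marker("python_version <= '3.11'")
    # True on Python 3.11 and earlier, False on 3.12 and later
    print(marker.evaluate())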