From d1f4ccec0b13fc7ca18bba5bd302c6cca808bb2a Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 14 Jun 2023 14:05:07 +0200 Subject: [PATCH 01/52] dense -> hist regressor algo --- onedal/ensemble/forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index f300d1785f..e4fb7c6eba 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -533,7 +533,7 @@ def __init__(self, voting_mode='weighted', error_metric_mode='none', variable_importance_mode='none', - algorithm='dense', + algorithm='hist', **kwargs): super().__init__( n_estimators=n_estimators, From 7f71f50d81f51ab96e31322ba76590b44f4e78ee Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:31:19 -0700 Subject: [PATCH 02/52] movement of numpy away from tree_state --- onedal/primitives/tree_visitor.cpp | 107 +++++++++++++++-------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 0f147e1e8e..d4e76f9afc 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -75,14 +75,45 @@ class skl_tree_node { // We only expose the minimum information to python template struct tree_state { - py::array_t node_ar; - py::array_t value_ar; + skl_tree_node * node_ar; + double * value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; std::size_t class_count; }; +template +class tree_state_py { +public: + py::array node_ar; + py::array value_ar; + std::size_t max_depth; + std::size_t node_count; + std::size_t leaf_count; + std::size_t class_count; + + tree_state_py(tree_state inp){ + this->max_depth = inp.max_depth; + this->node_count = inp.node_count; + this->leaf_count = inp.leaf_count; + this->class_count = inp.class_count; + + auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); + auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); + + auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), + 1, + static_cast(this->class_count) }); + auto value_ar_strides = py::array::StridesContainer( + { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); + + this->node_ar = py::array_t(node_ar_shape, node_ar_strides, inp.node_ar); + this->value_ar = py::array_t(value_ar_shape, value_ar_strides, inp.value_ar); + } +}; + + // Declaration and implementation. template class node_count_visitor { @@ -153,52 +184,32 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->max_depth = _depth; this->leaf_count = _n_leafs; this->class_count = _max_n_classes; - - auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); - auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); - - auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), - 1, - static_cast(this->class_count) }); - auto value_ar_strides = py::array::StridesContainer( - { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); - - skl_tree_node* node_ar_ptr = new skl_tree_node[this->node_count]; - - OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - double* value_ar_ptr = - new double[this->node_count * 1 * - this->class_count](); // oneDAL only supports scalar responses for now + this->node_ar = new skl_tree_node[node_count]; + this->value_ar = new double[node_count*1*class_count](); // oneDAL only supports scalar responses for now - this->node_ar = py::array_t(node_ar_shape, node_ar_strides, node_ar_ptr); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides, value_ar_ptr); } template bool to_sklearn_tree_object_visitor::call(const df::split_node_info& info) { - py::buffer_info node_ar_buf = this->node_ar.request(); - - skl_tree_node* node_ar_ptr = static_cast(node_ar_buf.ptr); - if (info.get_level() > 0) { // has parents Py_ssize_t parent = parents[info.get_level() - 1]; if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar_ptr[node_id].right_child < 0); - node_ar_ptr[parent].right_child = node_id; + assert(node_ar[node_id].right_child < 0); + node_ar[parent].right_child = node_id; } else { - node_ar_ptr[parent].left_child = node_id; + node_ar[parent].left_child = node_id; } } parents[info.get_level()] = node_id; - node_ar_ptr[node_id].feature = info.get_feature_index(); - node_ar_ptr[node_id].threshold = info.get_feature_value(); - node_ar_ptr[node_id].impurity = info.get_impurity(); - node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].missing_go_to_left = false; + node_ar[node_id].feature = info.get_feature_index(); + node_ar[node_id].threshold = info.get_feature_value(); + node_ar[node_id].impurity = info.get_impurity(); + node_ar[node_id].n_node_samples = info.get_sample_count(); + node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + node_ar[node_id].missing_go_to_left = false; // wrap-up ++node_id; @@ -208,25 +219,22 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& // stuff that is done for all leaf node types template void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info& info) { - py::buffer_info node_ar_buf = this->node_ar.request(); - - skl_tree_node* node_ar_ptr = static_cast(node_ar_buf.ptr); if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar_ptr[node_id].right_child < 0); - node_ar_ptr[parent].right_child = node_id; + if (node_ar[parent].left_child > 0) { + assert(node_ar[node_id].right_child < 0); + node_ar[parent].right_child = node_id; } else { - node_ar_ptr[parent].left_child = node_id; + node_ar[parent].left_child = node_id; } } - node_ar_ptr[node_id].impurity = info.get_impurity(); - node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].missing_go_to_left = false; + node_ar[node_id].impurity = info.get_impurity(); + node_ar[node_id].n_node_samples = info.get_sample_count(); + node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + node_ar[node_id].missing_go_to_left = false; } template <> @@ -235,10 +243,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - py::buffer_info value_ar_buf = this->value_ar.request(); - double* value_ar_ptr = static_cast(value_ar_buf.ptr); - - value_ar_ptr[node_id * 1 * this->class_count] = info.get_response(); + value_ar[node_id * 1 * this->class_count] = info.get_response(); // wrap-up ++node_id; @@ -248,8 +253,6 @@ bool to_sklearn_tree_object_visitor::call( template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { - py::buffer_info value_ar_buf = this->value_ar.request(); - double* value_ar_ptr = static_cast(value_ar_buf.ptr); if (info.get_level() > 0) { std::size_t depth = static_cast(info.get_level()) - 1; @@ -258,7 +261,7 @@ bool to_sklearn_tree_object_visitor::call( OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count); const auto row = id * 1 * this->class_count; OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response()); - value_ar_ptr[row + info.get_response()] += info.get_sample_count(); + value_ar[row + info.get_response()] += info.get_sample_count(); if (depth == 0) { break; } @@ -278,7 +281,7 @@ template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state; + using tree_state_t = tree_state_py; // TODO: // create one instance for cls and reg. From 88c9cdc18fb0b98798f8d2c25c6fd99b79f664fd Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:45:25 -0700 Subject: [PATCH 03/52] this-> overall --- onedal/primitives/tree_visitor.cpp | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index d4e76f9afc..6d8069cc7e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -86,8 +86,8 @@ struct tree_state { template class tree_state_py { public: - py::array node_ar; - py::array value_ar; + py::array_t node_ar; + py::array_t value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; @@ -185,8 +185,8 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->leaf_count = _n_leafs; this->class_count = _max_n_classes; OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - this->node_ar = new skl_tree_node[node_count]; - this->value_ar = new double[node_count*1*class_count](); // oneDAL only supports scalar responses for now + this->node_ar = new skl_tree_node[this->node_count]; + this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now } @@ -195,21 +195,21 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& if (info.get_level() > 0) { // has parents Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar[node_id].right_child < 0); - node_ar[parent].right_child = node_id; + if (this->node_ar[parent].left_child > 0) { + assert(this->node_ar[node_id].right_child < 0); + this->node_ar[parent].right_child = node_id; } else { - node_ar[parent].left_child = node_id; + this->node_ar[parent].left_child = node_id; } } parents[info.get_level()] = node_id; - node_ar[node_id].feature = info.get_feature_index(); - node_ar[node_id].threshold = info.get_feature_value(); - node_ar[node_id].impurity = info.get_impurity(); - node_ar[node_id].n_node_samples = info.get_sample_count(); - node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar[node_id].missing_go_to_left = false; + this->node_ar[node_id].feature = info.get_feature_index(); + this->node_ar[node_id].threshold = info.get_feature_value(); + this->node_ar[node_id].impurity = info.get_impurity(); + this->node_ar[node_id].n_node_samples = info.get_sample_count(); + this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar[node_id].missing_go_to_left = false; // wrap-up ++node_id; @@ -222,19 +222,19 @@ void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info< if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar[parent].left_child > 0) { - assert(node_ar[node_id].right_child < 0); - node_ar[parent].right_child = node_id; + if (this->node_ar[parent].left_child > 0) { + assert(this->node_ar[node_id].right_child < 0); + this->node_ar[parent].right_child = node_id; } else { - node_ar[parent].left_child = node_id; + this->node_ar[parent].left_child = node_id; } } - node_ar[node_id].impurity = info.get_impurity(); - node_ar[node_id].n_node_samples = info.get_sample_count(); - node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar[node_id].missing_go_to_left = false; + this->node_ar[node_id].impurity = info.get_impurity(); + this->node_ar[node_id].n_node_samples = info.get_sample_count(); + this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar[node_id].missing_go_to_left = false; } template <> @@ -243,7 +243,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - value_ar[node_id * 1 * this->class_count] = info.get_response(); + this->value_ar[node_id * 1 * this->class_count] = info.get_response(); // wrap-up ++node_id; @@ -261,7 +261,7 @@ bool to_sklearn_tree_object_visitor::call( OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count); const auto row = id * 1 * this->class_count; OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response()); - value_ar[row + info.get_response()] += info.get_sample_count(); + this->value_ar[row + info.get_response()] += info.get_sample_count(); if (depth == 0) { break; } @@ -270,7 +270,7 @@ bool to_sklearn_tree_object_visitor::call( } _onLeafNode(info); OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response()); - value_ar_ptr[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); + this->value_ar[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); // wrap-up ++node_id; From f6be29c9fd5dbeb0b5db7b9e167e1553b5f0dfd1 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:48:09 -0700 Subject: [PATCH 04/52] forgotten template --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 6d8069cc7e..a1fcedb137 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -93,7 +93,7 @@ class tree_state_py { std::size_t leaf_count; std::size_t class_count; - tree_state_py(tree_state inp){ + tree_state_py(tree_state inp){ this->max_depth = inp.max_depth; this->node_count = inp.node_count; this->leaf_count = inp.leaf_count; From 122515fca422c4102cec01e0d68fc318a7308dbc Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 02:50:54 -0700 Subject: [PATCH 05/52] change sample_weight checks --- onedal/ensemble/forest.py | 57 ++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index e4fb7c6eba..63a8cee3eb 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -305,40 +305,41 @@ def _validate_targets(self, y, dtype): self.classes_ = None return _column_or_1d(y, warn=True).astype(dtype, copy=False) - def _get_sample_weight(self, X, y, sample_weight): + def _get_sample_weight(self, sample_weight, X): n_samples = X.shape[0] dtype = X.dtype - if n_samples == 1: - raise ValueError("n_samples=1") + #if n_samples == 1: + # raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=dtype) - sample_weight = sample_weight.ravel() + #sample_weight = np.asarray([] + # if sample_weight is None + # else sample_weight, dtype=dtype) + sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() - sample_weight_count = sample_weight.shape[0] - if sample_weight_count != 0 and sample_weight_count != n_samples: + if sample_weight.size != X.shape[0]: raise ValueError("sample_weight and X have incompatible shapes: " "%r vs %r\n" "Note: Sparse matrices cannot be indexed w/" "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + % (sample_weight.shape, X.shape)) - if sample_weight_count == 0: - sample_weight = np.ones(n_samples, dtype=dtype) - elif isinstance(sample_weight, Number): - sample_weight = np.full(n_samples, sample_weight, dtype=dtype) - else: + + #if sample_weight_count == 0: + # sample_weight = np.ones(n_samples, dtype=dtype) + #elif isinstance(sample_weight, Number): + # sample_weight = np.full(n_samples, sample_weight, dtype=dtype) + #else: + if True: sample_weight = _check_array( sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, order="C" ) - if sample_weight.ndim != 1: - raise ValueError("Sample weights must be 1D array or scalar") + #if sample_weight.ndim != 1: + # raise ValueError("Sample weights must be 1D array or scalar") - if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + #if sample_weight.shape != (n_samples,): + # raise ValueError("sample_weight.shape == {}, expected {}!" + # .format(sample_weight.shape, (n_samples,))) return sample_weight def _get_policy(self, queue, *data): @@ -349,17 +350,25 @@ def _fit(self, X, y, sample_weight, module, queue): X, y, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse='csr') y = self._validate_targets(y, X.dtype) - sample_weight = self._get_sample_weight(X, y, sample_weight) self.n_features_in_ = X.shape[1] if not sklearn_check_version('1.0'): self.n_features_ = self.n_features_in_ - policy = self._get_policy(queue, X, y, sample_weight) - X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) + data = [X, y] + + if(sample_weight is not None and len(sample_weight > 0)): + sample_weight = self._get_sample_weight(sample_weight, X) + data.append(sample_weight) + + + policy = self._get_policy(queue, *data) + + #pass as *data + data = _convert_to_supported(policy, *data) params = self._get_onedal_params(X) train_result = module.train( - policy, params, *to_table(X, y, sample_weight)) + policy, params, *to_table(*data)) self._onedal_model = train_result.model if self.oob_score: From 60cebf192e770ac1dcb3f425511741b34d99f55d Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:12:16 -0700 Subject: [PATCH 06/52] remove forced sample_weight use --- onedal/ensemble/forest.cpp | 11 +++++++++++ onedal/ensemble/forest.py | 27 +++++---------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/onedal/ensemble/forest.cpp b/onedal/ensemble/forest.cpp index a79f6b26cd..86ac3610d7 100644 --- a/onedal/ensemble/forest.cpp +++ b/onedal/ensemble/forest.cpp @@ -216,6 +216,17 @@ void init_train_ops(py::module_& m) { train_ops ops(policy, input_t{ data, responses, weights }, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); + m.def("train", + [](const Policy& policy, + const py::dict& params, + const table& data, + const table& responses) { + using namespace decision_forest; + using input_t = train_input; + + train_ops ops(policy, input_t{ data, responses}, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + }); } template diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 63a8cee3eb..2047e80b9c 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -308,12 +308,7 @@ def _validate_targets(self, y, dtype): def _get_sample_weight(self, sample_weight, X): n_samples = X.shape[0] dtype = X.dtype - #if n_samples == 1: - # raise ValueError("n_samples=1") - #sample_weight = np.asarray([] - # if sample_weight is None - # else sample_weight, dtype=dtype) sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() if sample_weight.size != X.shape[0]: @@ -323,23 +318,11 @@ def _get_sample_weight(self, sample_weight, X): "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape)) + sample_weight = _check_array( + sample_weight, accept_sparse=False, ensure_2d=False, + dtype=dtype, order="C" + ) - #if sample_weight_count == 0: - # sample_weight = np.ones(n_samples, dtype=dtype) - #elif isinstance(sample_weight, Number): - # sample_weight = np.full(n_samples, sample_weight, dtype=dtype) - #else: - if True: - sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" - ) - #if sample_weight.ndim != 1: - # raise ValueError("Sample weights must be 1D array or scalar") - - #if sample_weight.shape != (n_samples,): - # raise ValueError("sample_weight.shape == {}, expected {}!" - # .format(sample_weight.shape, (n_samples,))) return sample_weight def _get_policy(self, queue, *data): @@ -357,7 +340,7 @@ def _fit(self, X, y, sample_weight, module, queue): data = [X, y] - if(sample_weight is not None and len(sample_weight > 0)): + if(sample_weight is not None and len(sample_weight) > 0): sample_weight = self._get_sample_weight(sample_weight, X) data.append(sample_weight) From 47fdfecf4bbbd3ff9ef21ce51c5a3072adf54f1c Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:29:20 -0700 Subject: [PATCH 07/52] PEP8 compliance --- onedal/ensemble/forest.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 2047e80b9c..ca532bd0a5 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -306,10 +306,7 @@ def _validate_targets(self, y, dtype): return _column_or_1d(y, warn=True).astype(dtype, copy=False) def _get_sample_weight(self, sample_weight, X): - n_samples = X.shape[0] - dtype = X.dtype - - sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() + sample_weight = np.asarray(sample_weight, dtype=X.dtype).ravel() if sample_weight.size != X.shape[0]: raise ValueError("sample_weight and X have incompatible shapes: " @@ -340,11 +337,10 @@ def _fit(self, X, y, sample_weight, module, queue): data = [X, y] - if(sample_weight is not None and len(sample_weight) > 0): + if sample_weight is not None and len(sample_weight) > 0: sample_weight = self._get_sample_weight(sample_weight, X) data.append(sample_weight) - policy = self._get_policy(queue, *data) #pass as *data From 47f84150233e930be49394868260892253458b27 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:30:39 -0700 Subject: [PATCH 08/52] codefactor-io compliance --- onedal/primitives/tree_visitor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index a1fcedb137..88ff3a8a84 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -187,7 +187,6 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); this->node_ar = new skl_tree_node[this->node_count]; this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now - } template @@ -219,7 +218,6 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& // stuff that is done for all leaf node types template void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info& info) { - if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; if (this->node_ar[parent].left_child > 0) { From 4c148eb77713f1303a2c073d01c6cb196d421124 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:31:55 -0700 Subject: [PATCH 09/52] PEP8 compliance --- onedal/ensemble/forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index ca532bd0a5..2ae988de07 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -317,7 +317,7 @@ def _get_sample_weight(self, sample_weight, X): sample_weight = _check_array( sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + dtype=X.dtype, order="C" ) return sample_weight From 0faf7eb891884905f082e92566b76829d88329c6 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:46 +0200 Subject: [PATCH 10/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 88ff3a8a84..d8a039f66e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -241,7 +241,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - this->value_ar[node_id * 1 * this->class_count] = info.get_response(); + this->value_ar[node_id * this->class_count] = info.get_response(); // wrap-up ++node_id; From 4c31e8beab89265ea7ca5543e66c110fcde1eac2 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:53 +0200 Subject: [PATCH 11/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index d8a039f66e..7706159d51 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -186,7 +186,7 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->class_count = _max_n_classes; OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); this->node_ar = new skl_tree_node[this->node_count]; - this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now + this->value_ar = new double[this->node_count * this->class_count](); // oneDAL only supports scalar responses for now } template From b3437384fae83ec0a565e22ab9e3e19c119b7131 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:59 +0200 Subject: [PATCH 12/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 7706159d51..c99aad47e4 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -268,7 +268,7 @@ bool to_sklearn_tree_object_visitor::call( } _onLeafNode(info); OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response()); - this->value_ar[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); + this->value_ar[node_id * this->class_count + info.get_response()] += info.get_sample_count(); // wrap-up ++node_id; From cf21916c657991faef7a7118ca0322728088b905 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 14 Jun 2023 14:05:07 +0200 Subject: [PATCH 13/52] dense -> hist regressor algo --- onedal/ensemble/forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index f300d1785f..e4fb7c6eba 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -533,7 +533,7 @@ def __init__(self, voting_mode='weighted', error_metric_mode='none', variable_importance_mode='none', - algorithm='dense', + algorithm='hist', **kwargs): super().__init__( n_estimators=n_estimators, From 44dea37739356b90bbd729c05a39868fd0591dbe Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:31:19 -0700 Subject: [PATCH 14/52] movement of numpy away from tree_state --- onedal/primitives/tree_visitor.cpp | 107 +++++++++++++++-------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 0f147e1e8e..d4e76f9afc 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -75,14 +75,45 @@ class skl_tree_node { // We only expose the minimum information to python template struct tree_state { - py::array_t node_ar; - py::array_t value_ar; + skl_tree_node * node_ar; + double * value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; std::size_t class_count; }; +template +class tree_state_py { +public: + py::array node_ar; + py::array value_ar; + std::size_t max_depth; + std::size_t node_count; + std::size_t leaf_count; + std::size_t class_count; + + tree_state_py(tree_state inp){ + this->max_depth = inp.max_depth; + this->node_count = inp.node_count; + this->leaf_count = inp.leaf_count; + this->class_count = inp.class_count; + + auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); + auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); + + auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), + 1, + static_cast(this->class_count) }); + auto value_ar_strides = py::array::StridesContainer( + { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); + + this->node_ar = py::array_t(node_ar_shape, node_ar_strides, inp.node_ar); + this->value_ar = py::array_t(value_ar_shape, value_ar_strides, inp.value_ar); + } +}; + + // Declaration and implementation. template class node_count_visitor { @@ -153,52 +184,32 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->max_depth = _depth; this->leaf_count = _n_leafs; this->class_count = _max_n_classes; - - auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); - auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); - - auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), - 1, - static_cast(this->class_count) }); - auto value_ar_strides = py::array::StridesContainer( - { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); - - skl_tree_node* node_ar_ptr = new skl_tree_node[this->node_count]; - - OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - double* value_ar_ptr = - new double[this->node_count * 1 * - this->class_count](); // oneDAL only supports scalar responses for now + this->node_ar = new skl_tree_node[node_count]; + this->value_ar = new double[node_count*1*class_count](); // oneDAL only supports scalar responses for now - this->node_ar = py::array_t(node_ar_shape, node_ar_strides, node_ar_ptr); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides, value_ar_ptr); } template bool to_sklearn_tree_object_visitor::call(const df::split_node_info& info) { - py::buffer_info node_ar_buf = this->node_ar.request(); - - skl_tree_node* node_ar_ptr = static_cast(node_ar_buf.ptr); - if (info.get_level() > 0) { // has parents Py_ssize_t parent = parents[info.get_level() - 1]; if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar_ptr[node_id].right_child < 0); - node_ar_ptr[parent].right_child = node_id; + assert(node_ar[node_id].right_child < 0); + node_ar[parent].right_child = node_id; } else { - node_ar_ptr[parent].left_child = node_id; + node_ar[parent].left_child = node_id; } } parents[info.get_level()] = node_id; - node_ar_ptr[node_id].feature = info.get_feature_index(); - node_ar_ptr[node_id].threshold = info.get_feature_value(); - node_ar_ptr[node_id].impurity = info.get_impurity(); - node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].missing_go_to_left = false; + node_ar[node_id].feature = info.get_feature_index(); + node_ar[node_id].threshold = info.get_feature_value(); + node_ar[node_id].impurity = info.get_impurity(); + node_ar[node_id].n_node_samples = info.get_sample_count(); + node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + node_ar[node_id].missing_go_to_left = false; // wrap-up ++node_id; @@ -208,25 +219,22 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& // stuff that is done for all leaf node types template void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info& info) { - py::buffer_info node_ar_buf = this->node_ar.request(); - - skl_tree_node* node_ar_ptr = static_cast(node_ar_buf.ptr); if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar_ptr[node_id].right_child < 0); - node_ar_ptr[parent].right_child = node_id; + if (node_ar[parent].left_child > 0) { + assert(node_ar[node_id].right_child < 0); + node_ar[parent].right_child = node_id; } else { - node_ar_ptr[parent].left_child = node_id; + node_ar[parent].left_child = node_id; } } - node_ar_ptr[node_id].impurity = info.get_impurity(); - node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar_ptr[node_id].missing_go_to_left = false; + node_ar[node_id].impurity = info.get_impurity(); + node_ar[node_id].n_node_samples = info.get_sample_count(); + node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + node_ar[node_id].missing_go_to_left = false; } template <> @@ -235,10 +243,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - py::buffer_info value_ar_buf = this->value_ar.request(); - double* value_ar_ptr = static_cast(value_ar_buf.ptr); - - value_ar_ptr[node_id * 1 * this->class_count] = info.get_response(); + value_ar[node_id * 1 * this->class_count] = info.get_response(); // wrap-up ++node_id; @@ -248,8 +253,6 @@ bool to_sklearn_tree_object_visitor::call( template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { - py::buffer_info value_ar_buf = this->value_ar.request(); - double* value_ar_ptr = static_cast(value_ar_buf.ptr); if (info.get_level() > 0) { std::size_t depth = static_cast(info.get_level()) - 1; @@ -258,7 +261,7 @@ bool to_sklearn_tree_object_visitor::call( OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count); const auto row = id * 1 * this->class_count; OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response()); - value_ar_ptr[row + info.get_response()] += info.get_sample_count(); + value_ar[row + info.get_response()] += info.get_sample_count(); if (depth == 0) { break; } @@ -278,7 +281,7 @@ template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state; + using tree_state_t = tree_state_py; // TODO: // create one instance for cls and reg. From fe16fc561acb76a14b083e38ebfad6fade5f4617 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:45:25 -0700 Subject: [PATCH 15/52] this-> overall --- onedal/primitives/tree_visitor.cpp | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index d4e76f9afc..6d8069cc7e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -86,8 +86,8 @@ struct tree_state { template class tree_state_py { public: - py::array node_ar; - py::array value_ar; + py::array_t node_ar; + py::array_t value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; @@ -185,8 +185,8 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->leaf_count = _n_leafs; this->class_count = _max_n_classes; OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - this->node_ar = new skl_tree_node[node_count]; - this->value_ar = new double[node_count*1*class_count](); // oneDAL only supports scalar responses for now + this->node_ar = new skl_tree_node[this->node_count]; + this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now } @@ -195,21 +195,21 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& if (info.get_level() > 0) { // has parents Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar_ptr[parent].left_child > 0) { - assert(node_ar[node_id].right_child < 0); - node_ar[parent].right_child = node_id; + if (this->node_ar[parent].left_child > 0) { + assert(this->node_ar[node_id].right_child < 0); + this->node_ar[parent].right_child = node_id; } else { - node_ar[parent].left_child = node_id; + this->node_ar[parent].left_child = node_id; } } parents[info.get_level()] = node_id; - node_ar[node_id].feature = info.get_feature_index(); - node_ar[node_id].threshold = info.get_feature_value(); - node_ar[node_id].impurity = info.get_impurity(); - node_ar[node_id].n_node_samples = info.get_sample_count(); - node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar[node_id].missing_go_to_left = false; + this->node_ar[node_id].feature = info.get_feature_index(); + this->node_ar[node_id].threshold = info.get_feature_value(); + this->node_ar[node_id].impurity = info.get_impurity(); + this->node_ar[node_id].n_node_samples = info.get_sample_count(); + this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar[node_id].missing_go_to_left = false; // wrap-up ++node_id; @@ -222,19 +222,19 @@ void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info< if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; - if (node_ar[parent].left_child > 0) { - assert(node_ar[node_id].right_child < 0); - node_ar[parent].right_child = node_id; + if (this->node_ar[parent].left_child > 0) { + assert(this->node_ar[node_id].right_child < 0); + this->node_ar[parent].right_child = node_id; } else { - node_ar[parent].left_child = node_id; + this->node_ar[parent].left_child = node_id; } } - node_ar[node_id].impurity = info.get_impurity(); - node_ar[node_id].n_node_samples = info.get_sample_count(); - node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - node_ar[node_id].missing_go_to_left = false; + this->node_ar[node_id].impurity = info.get_impurity(); + this->node_ar[node_id].n_node_samples = info.get_sample_count(); + this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar[node_id].missing_go_to_left = false; } template <> @@ -243,7 +243,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - value_ar[node_id * 1 * this->class_count] = info.get_response(); + this->value_ar[node_id * 1 * this->class_count] = info.get_response(); // wrap-up ++node_id; @@ -261,7 +261,7 @@ bool to_sklearn_tree_object_visitor::call( OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count); const auto row = id * 1 * this->class_count; OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response()); - value_ar[row + info.get_response()] += info.get_sample_count(); + this->value_ar[row + info.get_response()] += info.get_sample_count(); if (depth == 0) { break; } @@ -270,7 +270,7 @@ bool to_sklearn_tree_object_visitor::call( } _onLeafNode(info); OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response()); - value_ar_ptr[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); + this->value_ar[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); // wrap-up ++node_id; From cd8c690c4e019b42cf590a0c417b3d424d646827 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 01:48:09 -0700 Subject: [PATCH 16/52] forgotten template --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 6d8069cc7e..a1fcedb137 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -93,7 +93,7 @@ class tree_state_py { std::size_t leaf_count; std::size_t class_count; - tree_state_py(tree_state inp){ + tree_state_py(tree_state inp){ this->max_depth = inp.max_depth; this->node_count = inp.node_count; this->leaf_count = inp.leaf_count; From 7faac32d60fd39303bd97de231bb15c1941f2cfb Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 02:50:54 -0700 Subject: [PATCH 17/52] change sample_weight checks --- onedal/ensemble/forest.py | 57 ++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index e4fb7c6eba..63a8cee3eb 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -305,40 +305,41 @@ def _validate_targets(self, y, dtype): self.classes_ = None return _column_or_1d(y, warn=True).astype(dtype, copy=False) - def _get_sample_weight(self, X, y, sample_weight): + def _get_sample_weight(self, sample_weight, X): n_samples = X.shape[0] dtype = X.dtype - if n_samples == 1: - raise ValueError("n_samples=1") + #if n_samples == 1: + # raise ValueError("n_samples=1") - sample_weight = np.asarray([] - if sample_weight is None - else sample_weight, dtype=dtype) - sample_weight = sample_weight.ravel() + #sample_weight = np.asarray([] + # if sample_weight is None + # else sample_weight, dtype=dtype) + sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() - sample_weight_count = sample_weight.shape[0] - if sample_weight_count != 0 and sample_weight_count != n_samples: + if sample_weight.size != X.shape[0]: raise ValueError("sample_weight and X have incompatible shapes: " "%r vs %r\n" "Note: Sparse matrices cannot be indexed w/" "boolean masks (use `indices=True` in CV)." - % (len(sample_weight), X.shape)) + % (sample_weight.shape, X.shape)) - if sample_weight_count == 0: - sample_weight = np.ones(n_samples, dtype=dtype) - elif isinstance(sample_weight, Number): - sample_weight = np.full(n_samples, sample_weight, dtype=dtype) - else: + + #if sample_weight_count == 0: + # sample_weight = np.ones(n_samples, dtype=dtype) + #elif isinstance(sample_weight, Number): + # sample_weight = np.full(n_samples, sample_weight, dtype=dtype) + #else: + if True: sample_weight = _check_array( sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, order="C" ) - if sample_weight.ndim != 1: - raise ValueError("Sample weights must be 1D array or scalar") + #if sample_weight.ndim != 1: + # raise ValueError("Sample weights must be 1D array or scalar") - if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + #if sample_weight.shape != (n_samples,): + # raise ValueError("sample_weight.shape == {}, expected {}!" + # .format(sample_weight.shape, (n_samples,))) return sample_weight def _get_policy(self, queue, *data): @@ -349,17 +350,25 @@ def _fit(self, X, y, sample_weight, module, queue): X, y, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse='csr') y = self._validate_targets(y, X.dtype) - sample_weight = self._get_sample_weight(X, y, sample_weight) self.n_features_in_ = X.shape[1] if not sklearn_check_version('1.0'): self.n_features_ = self.n_features_in_ - policy = self._get_policy(queue, X, y, sample_weight) - X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) + data = [X, y] + + if(sample_weight is not None and len(sample_weight > 0)): + sample_weight = self._get_sample_weight(sample_weight, X) + data.append(sample_weight) + + + policy = self._get_policy(queue, *data) + + #pass as *data + data = _convert_to_supported(policy, *data) params = self._get_onedal_params(X) train_result = module.train( - policy, params, *to_table(X, y, sample_weight)) + policy, params, *to_table(*data)) self._onedal_model = train_result.model if self.oob_score: From 3f1bd81be45e740eed2d581c6a0ac2bd857b0e0e Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:12:16 -0700 Subject: [PATCH 18/52] remove forced sample_weight use --- onedal/ensemble/forest.cpp | 11 +++++++++++ onedal/ensemble/forest.py | 27 +++++---------------------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/onedal/ensemble/forest.cpp b/onedal/ensemble/forest.cpp index a79f6b26cd..86ac3610d7 100644 --- a/onedal/ensemble/forest.cpp +++ b/onedal/ensemble/forest.cpp @@ -216,6 +216,17 @@ void init_train_ops(py::module_& m) { train_ops ops(policy, input_t{ data, responses, weights }, params2desc{}); return fptype2t{ method2t{ Task{}, ops } }(params); }); + m.def("train", + [](const Policy& policy, + const py::dict& params, + const table& data, + const table& responses) { + using namespace decision_forest; + using input_t = train_input; + + train_ops ops(policy, input_t{ data, responses}, params2desc{}); + return fptype2t{ method2t{ Task{}, ops } }(params); + }); } template diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 63a8cee3eb..2047e80b9c 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -308,12 +308,7 @@ def _validate_targets(self, y, dtype): def _get_sample_weight(self, sample_weight, X): n_samples = X.shape[0] dtype = X.dtype - #if n_samples == 1: - # raise ValueError("n_samples=1") - #sample_weight = np.asarray([] - # if sample_weight is None - # else sample_weight, dtype=dtype) sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() if sample_weight.size != X.shape[0]: @@ -323,23 +318,11 @@ def _get_sample_weight(self, sample_weight, X): "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape)) + sample_weight = _check_array( + sample_weight, accept_sparse=False, ensure_2d=False, + dtype=dtype, order="C" + ) - #if sample_weight_count == 0: - # sample_weight = np.ones(n_samples, dtype=dtype) - #elif isinstance(sample_weight, Number): - # sample_weight = np.full(n_samples, sample_weight, dtype=dtype) - #else: - if True: - sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" - ) - #if sample_weight.ndim != 1: - # raise ValueError("Sample weights must be 1D array or scalar") - - #if sample_weight.shape != (n_samples,): - # raise ValueError("sample_weight.shape == {}, expected {}!" - # .format(sample_weight.shape, (n_samples,))) return sample_weight def _get_policy(self, queue, *data): @@ -357,7 +340,7 @@ def _fit(self, X, y, sample_weight, module, queue): data = [X, y] - if(sample_weight is not None and len(sample_weight > 0)): + if(sample_weight is not None and len(sample_weight) > 0): sample_weight = self._get_sample_weight(sample_weight, X) data.append(sample_weight) From 9e9c836770991445524db2c3c00c7941248e9645 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:29:20 -0700 Subject: [PATCH 19/52] PEP8 compliance --- onedal/ensemble/forest.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 2047e80b9c..ca532bd0a5 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -306,10 +306,7 @@ def _validate_targets(self, y, dtype): return _column_or_1d(y, warn=True).astype(dtype, copy=False) def _get_sample_weight(self, sample_weight, X): - n_samples = X.shape[0] - dtype = X.dtype - - sample_weight = np.asarray(sample_weight, dtype=dtype).ravel() + sample_weight = np.asarray(sample_weight, dtype=X.dtype).ravel() if sample_weight.size != X.shape[0]: raise ValueError("sample_weight and X have incompatible shapes: " @@ -340,11 +337,10 @@ def _fit(self, X, y, sample_weight, module, queue): data = [X, y] - if(sample_weight is not None and len(sample_weight) > 0): + if sample_weight is not None and len(sample_weight) > 0: sample_weight = self._get_sample_weight(sample_weight, X) data.append(sample_weight) - policy = self._get_policy(queue, *data) #pass as *data From bbf204013b113993335d2fa6d0e744ed4fdf8d05 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:30:39 -0700 Subject: [PATCH 20/52] codefactor-io compliance --- onedal/primitives/tree_visitor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index a1fcedb137..88ff3a8a84 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -187,7 +187,6 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); this->node_ar = new skl_tree_node[this->node_count]; this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now - } template @@ -219,7 +218,6 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& // stuff that is done for all leaf node types template void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info& info) { - if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; if (this->node_ar[parent].left_child > 0) { From e71c436f83fe8dbd88b1a31605abf28b159a86ca Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 15 Jun 2023 03:31:55 -0700 Subject: [PATCH 21/52] PEP8 compliance --- onedal/ensemble/forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index ca532bd0a5..2ae988de07 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -317,7 +317,7 @@ def _get_sample_weight(self, sample_weight, X): sample_weight = _check_array( sample_weight, accept_sparse=False, ensure_2d=False, - dtype=dtype, order="C" + dtype=X.dtype, order="C" ) return sample_weight From 04371a918871f194ba2aa510a847dcc7ca3e31b0 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:46 +0200 Subject: [PATCH 22/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 88ff3a8a84..d8a039f66e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -241,7 +241,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - this->value_ar[node_id * 1 * this->class_count] = info.get_response(); + this->value_ar[node_id * this->class_count] = info.get_response(); // wrap-up ++node_id; From 41002381933aa838eab5b8aa60dd9913a857b277 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:53 +0200 Subject: [PATCH 23/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index d8a039f66e..7706159d51 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -186,7 +186,7 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->class_count = _max_n_classes; OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); this->node_ar = new skl_tree_node[this->node_count]; - this->value_ar = new double[this->node_count*1*this->class_count](); // oneDAL only supports scalar responses for now + this->value_ar = new double[this->node_count * this->class_count](); // oneDAL only supports scalar responses for now } template From b0aa15c6d025594acdfc21d4eaa50502b01c0e8d Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Tue, 20 Jun 2023 11:06:59 +0200 Subject: [PATCH 24/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: KulikovNikita --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 7706159d51..c99aad47e4 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -268,7 +268,7 @@ bool to_sklearn_tree_object_visitor::call( } _onLeafNode(info); OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response()); - this->value_ar[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count(); + this->value_ar[node_id * this->class_count + info.get_response()] += info.get_sample_count(); // wrap-up ++node_id; From 6b9edd682ca234fa5f86510f4a271afb7cc4fbbf Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 13 Jul 2023 07:22:43 -0700 Subject: [PATCH 25/52] refactor to remove numpy array deepcopy --- onedal/primitives/tree_visitor.cpp | 120 +++++++++++++---------------- scripts/build_backend.py | 1 + 2 files changed, 53 insertions(+), 68 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index c99aad47e4..7e0a88ea9d 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -75,45 +75,14 @@ class skl_tree_node { // We only expose the minimum information to python template struct tree_state { - skl_tree_node * node_ar; - double * value_ar; - std::size_t max_depth; - std::size_t node_count; - std::size_t leaf_count; - std::size_t class_count; -}; - -template -class tree_state_py { -public: py::array_t node_ar; py::array_t value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; std::size_t class_count; - - tree_state_py(tree_state inp){ - this->max_depth = inp.max_depth; - this->node_count = inp.node_count; - this->leaf_count = inp.leaf_count; - this->class_count = inp.class_count; - - auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); - auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); - - auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), - 1, - static_cast(this->class_count) }); - auto value_ar_strides = py::array::StridesContainer( - { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); - - this->node_ar = py::array_t(node_ar_shape, node_ar_strides, inp.node_ar); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides, inp.value_ar); - } }; - // Declaration and implementation. template class node_count_visitor { @@ -170,6 +139,8 @@ class to_sklearn_tree_object_visitor : public tree_state { std::size_t max_n_classes; std::vector parents; void _onLeafNode(const df::leaf_node_info& info); + double* value_ar_ptr; + skl_tree_node* node_ar_ptr; }; template @@ -184,9 +155,26 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->max_depth = _depth; this->leaf_count = _n_leafs; this->class_count = _max_n_classes; + + auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); + auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); + + auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), + 1, + static_cast(this->class_count) }); + auto value_ar_strides = py::array::StridesContainer( + { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); + OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - this->node_ar = new skl_tree_node[this->node_count]; - this->value_ar = new double[this->node_count * this->class_count](); // oneDAL only supports scalar responses for now + + this->node_ar = py::array_t(node_ar_shape, node_ar_strides); + this->value_ar = py::array_t(value_ar_shape, value_ar_strides); + + py::buffer_info node_ar_buf = this->node_ar.request(); + this->node_ar_ptr = static_cast(node_ar_buf.ptr); + + py::buffer_info value_ar_buf = this->value_ar.request(); + this->value_ar_ptr = static_cast(value_ar_buf.ptr); } template @@ -194,21 +182,21 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& if (info.get_level() > 0) { // has parents Py_ssize_t parent = parents[info.get_level() - 1]; - if (this->node_ar[parent].left_child > 0) { - assert(this->node_ar[node_id].right_child < 0); - this->node_ar[parent].right_child = node_id; + if (this->node_ar_ptr[parent].left_child > 0) { + assert(this->node_ar_ptr[node_id].right_child < 0); + this->node_ar_ptr[parent].right_child = node_id; } else { - this->node_ar[parent].left_child = node_id; + this->node_ar_ptr[parent].left_child = node_id; } } parents[info.get_level()] = node_id; - this->node_ar[node_id].feature = info.get_feature_index(); - this->node_ar[node_id].threshold = info.get_feature_value(); - this->node_ar[node_id].impurity = info.get_impurity(); - this->node_ar[node_id].n_node_samples = info.get_sample_count(); - this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - this->node_ar[node_id].missing_go_to_left = false; + this->node_ar_ptr[node_id].feature = info.get_feature_index(); + this->node_ar_ptr[node_id].threshold = info.get_feature_value(); + this->node_ar_ptr[node_id].impurity = info.get_impurity(); + this->node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].missing_go_to_left = false; // wrap-up ++node_id; @@ -220,19 +208,19 @@ template void to_sklearn_tree_object_visitor::_onLeafNode(const df::leaf_node_info& info) { if (info.get_level()) { Py_ssize_t parent = parents[info.get_level() - 1]; - if (this->node_ar[parent].left_child > 0) { - assert(this->node_ar[node_id].right_child < 0); - this->node_ar[parent].right_child = node_id; + if (this->node_ar_ptr[parent].left_child > 0) { + assert(this->node_ar_ptr[node_id].right_child < 0); + this->node_ar_ptr[parent].right_child = node_id; } else { - this->node_ar[parent].left_child = node_id; + this->node_ar_ptr[parent].left_child = node_id; } } - this->node_ar[node_id].impurity = info.get_impurity(); - this->node_ar[node_id].n_node_samples = info.get_sample_count(); - this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); - this->node_ar[node_id].missing_go_to_left = false; + this->node_ar_ptr[node_id].impurity = info.get_impurity(); + this->node_ar_ptr[node_id].n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count(); + this->node_ar_ptr[node_id].missing_go_to_left = false; } template <> @@ -241,7 +229,7 @@ bool to_sklearn_tree_object_visitor::call( _onLeafNode(info); OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count); - this->value_ar[node_id * this->class_count] = info.get_response(); + this->value_ar_ptr[node_id * this->class_count] = info.get_response(); // wrap-up ++node_id; @@ -252,23 +240,19 @@ template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { - if (info.get_level() > 0) { - std::size_t depth = static_cast(info.get_level()) - 1; - while (depth >= 0) { - const std::size_t id = parents[depth]; - OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count); - const auto row = id * 1 * this->class_count; - OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response()); - this->value_ar[row + info.get_response()] += info.get_sample_count(); - if (depth == 0) { - break; - } - --depth; - } + + std::size_t depth = static_cast(info.get_level()); + std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction + double nNodeSampleCount = info.get_sample_count(); // do them only once + + while(depth--) + { + const std::size_t id = parents[depth]; + const std::size_t row = id * this->class_count; + this->value_ar_ptr[row + label] += nNodeSampleCount; } _onLeafNode(info); - OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response()); - this->value_ar[node_id * this->class_count + info.get_response()] += info.get_sample_count(); + this->value_ar_ptr[node_id * this->class_count + label] += nNodeSampleCount; // wrap-up ++node_id; @@ -279,7 +263,7 @@ template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state_py; + using tree_state_t = tree_state; // TODO: // create one instance for cls and reg. diff --git a/scripts/build_backend.py b/scripts/build_backend.py index b89ae07653..d390af1573 100755 --- a/scripts/build_backend.py +++ b/scripts/build_backend.py @@ -174,6 +174,7 @@ def custom_build_cmake_clib(iface, cxx=None, onedal_major_binary_version=1, no_d "-DoneDAL_INCLUDE_DIRS=" + jp(os.environ['DALROOT'], 'include'), "-DoneDAL_LIBRARY_DIR=" + jp(os.environ['DALROOT'], 'lib', 'intel64'), "-Dpybind11_DIR=" + pybind11.get_cmake_dir(), + "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" ] if dpctl_available: From 2e62f12a113f6fb75e2056891ab695317ded9b1f Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 13 Jul 2023 07:33:36 -0700 Subject: [PATCH 26/52] merge correction --- onedal/primitives/tree_visitor.cpp | 33 +----------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index e4169ba309..7ad8b4747f 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -75,45 +75,14 @@ class skl_tree_node { // We only expose the minimum information to python template struct tree_state { - skl_tree_node * node_ar; - double * value_ar; - std::size_t max_depth; - std::size_t node_count; - std::size_t leaf_count; - std::size_t class_count; -}; - -template -class tree_state_py { -public: py::array_t node_ar; py::array_t value_ar; std::size_t max_depth; std::size_t node_count; std::size_t leaf_count; std::size_t class_count; - - tree_state_py(tree_state inp){ - this->max_depth = inp.max_depth; - this->node_count = inp.node_count; - this->leaf_count = inp.leaf_count; - this->class_count = inp.class_count; - - auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); - auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) }); - - auto value_ar_shape = py::array::ShapeContainer({ static_cast(this->node_count), - 1, - static_cast(this->class_count) }); - auto value_ar_strides = py::array::StridesContainer( - { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) }); - - this->node_ar = py::array_t(node_ar_shape, node_ar_strides, inp.node_ar); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides, inp.value_ar); - } }; - // Declaration and implementation. template class node_count_visitor { @@ -292,7 +261,7 @@ template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state_py; + using tree_state_t = tree_state; // TODO: // create one instance for cls and reg. From 7f2635f4b07ae6d367e506f3c813c3d6b3cce238 Mon Sep 17 00:00:00 2001 From: icfaust Date: Thu, 20 Jul 2023 16:04:59 -0700 Subject: [PATCH 27/52] parallelism implemented --- onedal/primitives/__init__.py | 5 +- onedal/primitives/get_tree.py | 12 +-- onedal/primitives/tree_visitor.cpp | 111 ++++++++++++++++------ sklearnex/preview/ensemble/extra_trees.py | 25 ++--- sklearnex/preview/ensemble/forest.py | 27 ++---- 5 files changed, 104 insertions(+), 76 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 3b1a72bffc..97cb625524 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -15,11 +15,10 @@ # =============================================================================== from .kernel_functions import linear_kernel, rbf_kernel, poly_kernel, sigmoid_kernel -from .get_tree import get_tree_state_cls, get_tree_state_reg +from .get_tree import get_forest_state __all__ = [ - 'get_tree_state_cls', - 'get_tree_state_reg', + 'get_forest_state', 'linear_kernel', 'rbf_kernel', 'poly_kernel', diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 2ba33e1e61..9c2b8516b2 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -17,10 +17,8 @@ from onedal import _backend -def get_tree_state_cls(model, iTree, n_classes): - return _backend.get_tree.classification.get_tree_state( - model, iTree, n_classes) - - -def get_tree_state_reg(model, iTree): - return _backend.get_tree.regression.get_tree_state(model, iTree, 1) +def get_forest_state(model, n_classes=None): + if n_classes: + return _backend.get_tree.classification.get_all_states(model, n_classes) + else: + return _backend.get_tree.regression.get_all_states(model, 1) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 7ad8b4747f..e86b917128 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #define ONEDAL_PY_TERMINAL_NODE -1 #define ONEDAL_PY_NO_FEATURE -2 @@ -82,7 +83,7 @@ struct tree_state { std::size_t leaf_count; std::size_t class_count; }; - + // Declaration and implementation. template class node_count_visitor { @@ -100,6 +101,15 @@ class node_count_visitor { return true; } + /*node_count_visitor(node_count_visitor&&) = default; + bool operator()(const df::leaf_node_info& info) { + return call(info); + } + bool operator()(const df::split_node_info& info) { + return call(info); + }*/ + + std::size_t n_nodes; std::size_t depth; std::size_t n_leaf_nodes; @@ -133,14 +143,22 @@ class to_sklearn_tree_object_visitor : public tree_state { std::size_t _max_n_classes); bool call(const df::leaf_node_info& info); bool call(const df::split_node_info& info); + double* value_ar_ptr; + skl_tree_node* node_ar_ptr; + + /*to_sklearn_tree_object_visitor(to_sklearn_tree_object_visitor&&) = default; + bool operator()(const df::leaf_node_info& info) { + return call(info); + } + bool operator()(const df::split_node_info& info) { + return call(info); + }*/ protected: std::size_t node_id; std::size_t max_n_classes; std::vector parents; void _onLeafNode(const df::leaf_node_info& info); - double* value_ar_ptr; - skl_tree_node* node_ar_ptr; }; template @@ -167,14 +185,19 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count); - this->node_ar = py::array_t(node_ar_shape, node_ar_strides); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides); + this->node_ar_ptr = new skl_tree_node[this->node_count]; + this->value_ar_ptr = new double[this->node_count*this->class_count](); + + // array_t doesn't initialize the underlying memory with the object's constructor + // so the values will not match what is defined above, must be done on C++ side + this->node_ar = py::array_t(node_ar_shape, node_ar_strides, this->node_ar_ptr, py::none()); + this->value_ar = py::array_t(value_ar_shape, value_ar_strides, this->value_ar_ptr, py::none()); py::buffer_info node_ar_buf = this->node_ar.request(); - this->node_ar_ptr = static_cast(node_ar_buf.ptr); + //this->node_ar_ptr = static_cast(node_ar_buf.ptr); py::buffer_info value_ar_buf = this->value_ar.request(); - this->value_ar_ptr = static_cast(value_ar_buf.ptr); + //this->value_ar_ptr = static_cast(value_ar_buf.ptr); } template @@ -190,6 +213,7 @@ bool to_sklearn_tree_object_visitor::call(const df::split_node_info& this->node_ar_ptr[parent].left_child = node_id; } } + parents[info.get_level()] = node_id; this->node_ar_ptr[node_id].feature = info.get_feature_index(); this->node_ar_ptr[node_id].threshold = info.get_feature_value(); @@ -239,6 +263,7 @@ bool to_sklearn_tree_object_visitor::call( template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { + std::size_t depth = static_cast(info.get_level()); std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction double nNodeSampleCount = info.get_sample_count(); // do them only once @@ -257,36 +282,60 @@ bool to_sklearn_tree_object_visitor::call( return true; } +template +py::list get_all_states(const decision_forest::model& model, std::size_t n_classes) { + using ncv_dec = node_visitor>; + using tsv_dec = node_visitor>; + + std::size_t tree_count = model.get_tree_count(); + std::vector> ncvs(tree_count, node_count_visitor()); + std::vector ncv_decorators; + for(std::size_t i=0; i < tree_count; i++){ + ncv_decorators.push_back(ncv_dec{&ncvs[i]}); + } + + model.template traverse_depth_first, ncv_dec> (std::move(ncv_decorators)); + + // generate memory block here + py::list output; + + // this may be slow based on the memory allocation + std::vector> tsvs; + std::vector tsv_decorators; + for(std::size_t i=0; i < tree_count; i++){ + tsvs.push_back(to_sklearn_tree_object_visitor(ncvs[i].depth, + ncvs[i].n_nodes, + ncvs[i].n_leaf_nodes, + n_classes)); + } + // must be done separately due to the nature of the decorators and a constant pointer vs vector push back + for(std::size_t i=0; i < tree_count; i++){ + tsv_decorators.push_back(tsv_dec{&tsvs[i]}); + } + + model.template traverse_depth_first, tsv_dec>(std::move(tsv_decorators)); + + // create list here + for( std::size_t i=0; i < tree_count; i++){ + py::dict est_tree_state; + est_tree_state["max_depth"] = tsvs[i].max_depth; + est_tree_state["node_count"] = tsvs[i].node_count; + est_tree_state["nodes"] = tsvs[i].node_ar; + est_tree_state["values"] = tsvs[i].value_ar; + output.append(est_tree_state); + } + + return output; +} + + template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; using tree_state_t = tree_state; - // TODO: - // create one instance for cls and reg. - py::class_(m, "get_tree_state") - .def(py::init([](const model_t& model, std::size_t iTree, std::size_t n_classes) { - // First count nodes - node_count_visitor ncv; - node_visitor ncv_decorator{ &ncv }; - - model.traverse_depth_first(iTree, std::move(ncv_decorator)); - // then do the final tree traversal - to_sklearn_tree_object_visitor tsv(ncv.depth, - ncv.n_nodes, - ncv.n_leaf_nodes, - n_classes); - node_visitor tsv_decorator{ &tsv }; - model.traverse_depth_first(iTree, std::move(tsv_decorator)); - return tree_state_t(tsv); - })) - .def_readwrite("node_ar", &tree_state_t::node_ar, py::return_value_policy::take_ownership) - .def_readwrite("value_ar", &tree_state_t::value_ar, py::return_value_policy::take_ownership) - .def_readwrite("max_depth", &tree_state_t::max_depth) - .def_readwrite("node_count", &tree_state_t::node_count) - .def_readwrite("leaf_count", &tree_state_t::leaf_count) - .def_readwrite("class_count", &tree_state_t::class_count); + m.def("get_all_states", &get_all_states, py::return_value_policy::take_ownership); } ONEDAL_PY_TYPE2STR(decision_forest::task::classification, "classification"); diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index 5ac48a3e62..e68ba21ae6 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -53,7 +53,7 @@ from onedal.ensemble import ExtraTreesClassifier as onedal_ExtraTreesClassifier from onedal.ensemble import ExtraTreesRegressor as onedal_ExtraTreesRegressor -from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.primitives import get_forest_state from scipy import sparse as sp @@ -520,6 +520,7 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) + allstates = get_forest_state(self._onedal_model, n_classes_) for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -533,20 +534,14 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) - tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) est_i.tree_ = Tree( self.n_features_in_, np.array( [n_classes_], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(tree_i_state_dict) + est_i.tree_.__setstate__(allstates[i]) estimators_.append(est_i) self._cached_estimators_ = estimators_ @@ -899,6 +894,7 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) + allstates = get_all_states(self._onedal_model) for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -911,18 +907,11 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) - tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} - + allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) est_i.tree_ = Tree( self.n_features_in_, np.array( [1], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(tree_i_state_dict) + est_i.tree_.__setstate__(allstates[i]) estimators_.append(est_i) return estimators_ diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 4ff88fc6f9..b9487cdcb0 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -43,6 +43,7 @@ check_X_y) from onedal.datatypes import _num_features, _num_samples +from onedal import _backend from sklearn.utils import check_random_state, deprecated @@ -53,7 +54,7 @@ from onedal.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.ensemble import RandomForestRegressor as onedal_RandomForestRegressor -from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.primitives import get_forest_state from scipy import sparse as sp @@ -499,6 +500,8 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) + allstates = get_forest_state(self._onedal_model, n_classes_) + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -512,20 +515,14 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) - tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) est_i.tree_ = Tree( self.n_features_in_, np.array( [n_classes_], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(tree_i_state_dict) + est_i.tree_.__setstate__(allstates[i]) estimators_.append(est_i) self._cached_estimators_ = estimators_ @@ -875,6 +872,8 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) + allstates = get_forest_state(self._onedal_model) + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -887,18 +886,12 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) - tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) est_i.tree_ = Tree( self.n_features_in_, np.array( [1], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(tree_i_state_dict) + est_i.tree_.__setstate__(allstates[i]) estimators_.append(est_i) return estimators_ From 7bca052443efe8f8ec547956fe3feb9ce25ee30a Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 05:36:46 -0700 Subject: [PATCH 28/52] changes necessary for onedal versioning --- onedal/primitives/__init__.py | 10 ++++-- onedal/primitives/get_tree.py | 21 ++++++++--- onedal/primitives/tree_visitor.cpp | 34 ++++++++++++++++-- sklearnex/preview/ensemble/extra_trees.py | 44 +++++++++++++++++++---- sklearnex/preview/ensemble/forest.py | 37 +++++++++++++++---- 5 files changed, 124 insertions(+), 22 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 97cb625524..11a4df3c98 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,12 +14,18 @@ # limitations under the License. # =============================================================================== +from daal4py.sklearn._utils import daal_check_version from .kernel_functions import linear_kernel, rbf_kernel, poly_kernel, sigmoid_kernel -from .get_tree import get_forest_state +from .get_tree import get_tree_state_cls, get_tree_state_reg __all__ = [ - 'get_forest_state', + 'get_tree_state_cls', + 'get_tree_state_reg', 'linear_kernel', 'rbf_kernel', 'poly_kernel', 'sigmoid_kernel'] + +if daal_check_version((2023, 'P', 301)): + from .get_tree import get_forest_state + __all__ += ['get_forest_state'] \ No newline at end of file diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 9c2b8516b2..07b7ba52a6 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -14,11 +14,22 @@ # limitations under the License. # =============================================================================== +from daal4py.sklearn._utils import daal_check_version from onedal import _backend -def get_forest_state(model, n_classes=None): - if n_classes: - return _backend.get_tree.classification.get_all_states(model, n_classes) - else: - return _backend.get_tree.regression.get_all_states(model, 1) +def get_tree_state_cls(model, iTree, n_classes): + return _backend.get_tree.classification.get_tree_state( + model, iTree, n_classes) + + +def get_tree_state_reg(model, iTree): + return _backend.get_tree.regression.get_tree_state(model, iTree, 1) + + +if daal_check_version((2023, 'P', 301)): + def get_forest_state(model, n_classes=None): + if n_classes: + return _backend.get_tree.classification.get_all_states(model, n_classes) + else: + return _backend.get_tree.regression.get_all_states(model, 1) \ No newline at end of file diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index e86b917128..4b59c1bdbf 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -18,6 +18,7 @@ #include "onedal/common.hpp" #include "oneapi/dal/algo/decision_forest.hpp" #include "numpy/arrayobject.h" +#include "onedal/version.hpp" #include #include @@ -282,6 +283,7 @@ bool to_sklearn_tree_object_visitor::call( return true; } +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 template py::list get_all_states(const decision_forest::model& model, std::size_t n_classes) { using ncv_dec = node_visitor>; @@ -327,15 +329,43 @@ py::list get_all_states(const decision_forest::model& model, std::size_t n return output; } +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state; - + using tree_state_t = tree_state; + + // TODO: + // create one instance for cls and reg. + py::class_(m, "get_tree_state") + .def(py::init([](const model_t& model, std::size_t iTree, std::size_t n_classes) { + // First count nodes + node_count_visitor ncv; + node_visitor ncv_decorator{ &ncv }; + + model.traverse_depth_first(iTree, std::move(ncv_decorator)); + // then do the final tree traversal + to_sklearn_tree_object_visitor tsv(ncv.depth, + ncv.n_nodes, + ncv.n_leaf_nodes, + n_classes); + node_visitor tsv_decorator{ &tsv }; + model.traverse_depth_first(iTree, std::move(tsv_decorator)); + return tree_state_t(tsv); + })) + .def_readwrite("node_ar", &tree_state_t::node_ar, py::return_value_policy::take_ownership) + .def_readwrite("value_ar", &tree_state_t::value_ar, py::return_value_policy::take_ownership) + .def_readwrite("max_depth", &tree_state_t::max_depth) + .def_readwrite("node_count", &tree_state_t::node_count) + .def_readwrite("leaf_count", &tree_state_t::leaf_count) + .def_readwrite("class_count", &tree_state_t::class_count); + +#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 m.def("get_all_states", &get_all_states, py::return_value_policy::take_ownership); +#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 } ONEDAL_PY_TYPE2STR(decision_forest::task::classification, "classification"); diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index e68ba21ae6..cdec93d17a 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -53,13 +53,16 @@ from onedal.ensemble import ExtraTreesClassifier as onedal_ExtraTreesClassifier from onedal.ensemble import ExtraTreesRegressor as onedal_ExtraTreesRegressor -from onedal.primitives import get_forest_state +from onedal.primitives import get_tree_state_cls, get_tree_state_reg from scipy import sparse as sp if sklearn_check_version('1.2'): from sklearn.utils._param_validation import Interval +if daal_check_version((2023, 'P', 301)): + from onedal.primitives import get_forest_state + class BaseTree(ABC): def _fit_proba(self, X, y, sample_weight=None, queue=None): @@ -519,8 +522,11 @@ def _estimators_(self): # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution estimators_ = [] + random_state_checked = check_random_state(self.random_state) - allstates = get_forest_state(self._onedal_model, n_classes_) + if daal_check_version((2023, 'P', 301)): + allstates = get_forest_state(self._onedal_model, n_classes_) + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -534,14 +540,25 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) + if daal_check_version((2023, 'P', 301)): + tree_i_state_dict = allstates[i] + tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + else: + tree_i_state_class = get_tree_state_cls( + self._onedal_model, i, n_classes_) + tree_i_state_dict = { + 'max_depth': tree_i_state_class.max_depth, + 'node_count': tree_i_state_class.node_count, + 'nodes': check_tree_nodes(tree_i_state_class.node_ar), + 'values': tree_i_state_class.value_ar} + est_i.tree_ = Tree( self.n_features_in_, np.array( [n_classes_], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(allstates[i]) + est_i.tree_.__setstate__(tree_state_dict) estimators_.append(est_i) self._cached_estimators_ = estimators_ @@ -894,7 +911,9 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - allstates = get_all_states(self._onedal_model) + if daal_check_version((2023, 'P', 301)): + allstates = get_forest_state(self._onedal_model) + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( @@ -907,11 +926,22 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) + if daal_check_version((2023, 'P', 301)): + tree_i_state_dict = allstates[i] + tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + else: + tree_i_state_class = get_tree_state_reg( + self._onedal_model, i) + tree_i_state_dict = { + 'max_depth': tree_i_state_class.max_depth, + 'node_count': tree_i_state_class.node_count, + 'nodes': check_tree_nodes(tree_i_state_class.node_ar), + 'values': tree_i_state_class.value_ar} + est_i.tree_ = Tree( self.n_features_in_, np.array( [1], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(allstates[i]) + est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) return estimators_ diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index b9487cdcb0..6009c640d6 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -54,13 +54,16 @@ from onedal.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.ensemble import RandomForestRegressor as onedal_RandomForestRegressor -from onedal.primitives import get_forest_state +from onedal.primitives import get_tree_state_cls, get_tree_state_reg from scipy import sparse as sp if sklearn_check_version('1.2'): from sklearn.utils._param_validation import Interval, StrOptions +if daal_check_version((2023, 'P', 301)): + from onedal.primitives import get_forest_state + class BaseRandomForest(ABC): def _fit_proba(self, X, y, sample_weight=None, queue=None): @@ -500,7 +503,8 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - allstates = get_forest_state(self._onedal_model, n_classes_) + if daal_check_version((2023, 'P', 301)): + allstates = get_forest_state(self._onedal_model, n_classes_) for i in range(self.n_estimators): est_i = clone(est) @@ -515,14 +519,25 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) + if daal_check_version((2023, 'P', 301)): + tree_i_state_dict = allstates[i] + tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + else: + tree_i_state_class = get_tree_state_cls( + self._onedal_model, i, n_classes_) + tree_i_state_dict = { + 'max_depth': tree_i_state_class.max_depth, + 'node_count': tree_i_state_class.node_count, + 'nodes': check_tree_nodes(tree_i_state_class.node_ar), + 'values': tree_i_state_class.value_ar} + est_i.tree_ = Tree( self.n_features_in_, np.array( [n_classes_], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(allstates[i]) + est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) self._cached_estimators_ = estimators_ @@ -886,12 +901,22 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - allstates[i]['nodes'] = check_tree_nodes(allstates[i]['nodes']) + if daal_check_version((2023, 'P', 301)): + tree_i_state_dict = allstates[i] + tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + else: + tree_i_state_class = get_tree_state_reg( + self._onedal_model, i) + tree_i_state_dict = { + 'max_depth': tree_i_state_class.max_depth, + 'node_count': tree_i_state_class.node_count, + 'nodes': check_tree_nodes(tree_i_state_class.node_ar), + 'values': tree_i_state_class.value_ar} est_i.tree_ = Tree( self.n_features_in_, np.array( [1], dtype=np.intp), self.n_outputs_) - est_i.tree_.__setstate__(allstates[i]) + est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) return estimators_ From 0116e1134ba17036f0f5a09bfdd2e36fb16f27b1 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:20:09 -0700 Subject: [PATCH 29/52] review requested changes --- onedal/ensemble/forest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 2ae988de07..3a5d405529 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -308,6 +308,11 @@ def _validate_targets(self, y, dtype): def _get_sample_weight(self, sample_weight, X): sample_weight = np.asarray(sample_weight, dtype=X.dtype).ravel() + sample_weight = _check_array( + sample_weight, accept_sparse=False, ensure_2d=False, + dtype=X.dtype, order="C" + ) + if sample_weight.size != X.shape[0]: raise ValueError("sample_weight and X have incompatible shapes: " "%r vs %r\n" @@ -315,11 +320,6 @@ def _get_sample_weight(self, sample_weight, X): "boolean masks (use `indices=True` in CV)." % (sample_weight.shape, X.shape)) - sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=X.dtype, order="C" - ) - return sample_weight def _get_policy(self, queue, *data): From 010c2dc5751659833e46b01d14e3b713913f57e5 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:37:40 -0700 Subject: [PATCH 30/52] review requsted change expand --- onedal/ensemble/forest.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 3a5d405529..ea9cf6a8d4 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -335,19 +335,21 @@ def _fit(self, X, y, sample_weight, module, queue): if not sklearn_check_version('1.0'): self.n_features_ = self.n_features_in_ - data = [X, y] - if sample_weight is not None and len(sample_weight) > 0: sample_weight = self._get_sample_weight(sample_weight, X) - data.append(sample_weight) + policy = self._get_policy(queue, X, y, sample_weight) + X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) + params = self._get_onedal_params(X) + train_result = module.train( + policy, params, *to_table(X, y, sample_weight)) - policy = self._get_policy(queue, *data) + else: + policy = self._get_policy(queue, X, y) + X, y = = _convert_to_supported(policy, X, y) + params = self._get_onedal_params(X) + train_result = module.train( + policy, params, *to_table(X, y)) - #pass as *data - data = _convert_to_supported(policy, *data) - params = self._get_onedal_params(X) - train_result = module.train( - policy, params, *to_table(*data)) self._onedal_model = train_result.model if self.oob_score: From 7095110a3966bdf9e6f0052c612434aa3df33968 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:39:44 -0700 Subject: [PATCH 31/52] beautiful white spaces --- onedal/primitives/__init__.py | 2 +- onedal/primitives/get_tree.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 11a4df3c98..0a09c578c9 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -28,4 +28,4 @@ if daal_check_version((2023, 'P', 301)): from .get_tree import get_forest_state - __all__ += ['get_forest_state'] \ No newline at end of file + __all__ += ['get_forest_state'] diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 07b7ba52a6..202244496c 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -32,4 +32,4 @@ def get_forest_state(model, n_classes=None): if n_classes: return _backend.get_tree.classification.get_all_states(model, n_classes) else: - return _backend.get_tree.regression.get_all_states(model, 1) \ No newline at end of file + return _backend.get_tree.regression.get_all_states(model, 1) From 274816ef16496a52cc76a5b8e6a75ab676a8ba78 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:50:38 -0700 Subject: [PATCH 32/52] the snap --- onedal/ensemble/forest.py | 620 ++++++------ onedal/primitives/__init__.py | 18 +- scripts/build_backend.py | 3 +- sklearnex/preview/ensemble/extra_trees.py | 1120 ++++++++++++--------- sklearnex/preview/ensemble/forest.py | 824 ++++++++------- 5 files changed, 1434 insertions(+), 1151 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index d1341c0caa..940500f36d 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -14,8 +14,7 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version) +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version from abc import ABCMeta, abstractmethod import numbers @@ -26,11 +25,13 @@ check_random_state, compute_sample_weight, check_array, - deprecated) + deprecated, +) from sklearn.utils.validation import ( check_is_fitted, check_consistent_length, - _num_samples) + _num_samples, +) from math import ceil import numpy as np @@ -39,16 +40,13 @@ from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy from ..common._estimator_checks import _check_is_fitted -from ..datatypes import ( - from_table, - to_table, - _convert_to_supported) +from ..datatypes import from_table, to_table, _convert_to_supported from ..utils import ( _validate_targets, _check_X_y, _check_array, _column_or_1d, - _check_n_features + _check_n_features, ) from onedal import _backend @@ -58,33 +56,34 @@ class BaseForest(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__( - self, - n_estimators, - criterion, - max_depth, - min_samples_split, - min_samples_leaf, - min_weight_fraction_leaf, - max_features, - max_leaf_nodes, - min_impurity_decrease, - min_impurity_split, - bootstrap, - oob_score, - random_state, - warm_start, - class_weight, - ccp_alpha, - max_samples, - max_bins, - min_bin_size, - infer_mode, - splitter_mode, - voting_mode, - error_metric_mode, - variable_importance_mode, - algorithm, - **kwargs): + self, + n_estimators, + criterion, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + max_leaf_nodes, + min_impurity_decrease, + min_impurity_split, + bootstrap, + oob_score, + random_state, + warm_start, + class_weight, + ccp_alpha, + max_samples, + max_bins, + min_bin_size, + infer_mode, + splitter_mode, + voting_mode, + error_metric_mode, + variable_importance_mode, + algorithm, + **kwargs, + ): self.n_estimators = n_estimators self.bootstrap = bootstrap self.oob_score = oob_score @@ -111,32 +110,41 @@ def __init__( self.variable_importance_mode = variable_importance_mode self.algorithm = algorithm - def _to_absolute_max_features(self, max_features, n_features, - is_classification=False): + def _to_absolute_max_features( + self, max_features, n_features, is_classification=False + ): if max_features is None: return n_features if isinstance(max_features, str): if max_features == "auto": - if not sklearn_check_version('1.3'): - if sklearn_check_version('1.1'): + if not sklearn_check_version("1.3"): + if sklearn_check_version("1.1"): warnings.warn( "`max_features='auto'` has been deprecated in 1.1 " "and will be removed in 1.3. To keep the past behaviour, " "explicitly set `max_features=1.0` or remove this " "parameter as it is also the default value for " "RandomForestRegressors and ExtraTreesRegressors.", - FutureWarning, ) - return max(1, int(np.sqrt(n_features)) - ) if is_classification else n_features - if max_features == 'sqrt': + FutureWarning, + ) + return ( + max(1, int(np.sqrt(n_features))) + if is_classification + else n_features + ) + if max_features == "sqrt": return max(1, int(np.sqrt(n_features))) if max_features == "log2": return max(1, int(np.log2(n_features))) - allowed_string_values = '"sqrt" or "log2"' if sklearn_check_version( - '1.3') else '"auto", "sqrt" or "log2"' + allowed_string_values = ( + '"sqrt" or "log2"' + if sklearn_check_version("1.3") + else '"auto", "sqrt" or "log2"' + ) raise ValueError( - 'Invalid value for max_features. Allowed string ' - f'values are {allowed_string_values}.') + "Invalid value for max_features. Allowed string " + f"values are {allowed_string_values}." + ) if isinstance(max_features, (numbers.Integral, np.integer)): return max_features if max_features > 0.0: @@ -145,10 +153,10 @@ def _to_absolute_max_features(self, max_features, n_features, def _get_observations_per_tree_fraction(self, n_samples, max_samples): if max_samples is None: - return 1. + return 1.0 if isinstance(max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, max_samples)) @@ -159,9 +167,9 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): return max(float(max_samples / n_samples), 1 / n_samples) if isinstance(max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(max_samples)) @@ -177,12 +185,15 @@ def _get_observations_per_tree_fraction(self, n_samples, max_samples): def _get_onedal_params(self, data): n_samples, n_features = data.shape features_per_node = self._to_absolute_max_features( - self.max_features, n_features, self.is_classification) + self.max_features, n_features, self.is_classification + ) observations_per_tree_fraction = self._get_observations_per_tree_fraction( - n_samples=n_samples, max_samples=self.max_samples) - observations_per_tree_fraction = observations_per_tree_fraction if bool( - self.bootstrap) else 1. + n_samples=n_samples, max_samples=self.max_samples + ) + observations_per_tree_fraction = ( + observations_per_tree_fraction if bool(self.bootstrap) else 1.0 + ) if not self.bootstrap and self.max_samples is not None: raise ValueError( @@ -191,116 +202,126 @@ def _get_onedal_params(self, data): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") min_observations_in_leaf_node = ( - self.min_samples_leaf if isinstance( - self.min_samples_leaf, - numbers.Integral) else int( - ceil( - self.min_samples_leaf * n_samples))) + self.min_samples_leaf + if isinstance(self.min_samples_leaf, numbers.Integral) + else int(ceil(self.min_samples_leaf * n_samples)) + ) min_observations_in_split_node = ( - self.min_samples_split if isinstance( - self.min_samples_split, - numbers.Integral) else int( - ceil( - self.min_samples_split * n_samples))) + self.min_samples_split + if isinstance(self.min_samples_split, numbers.Integral) + else int(ceil(self.min_samples_split * n_samples)) + ) onedal_params = { - 'fptype': 'float' if data.dtype == np.float32 else 'double', - 'method': self.algorithm, - 'infer_mode': self.infer_mode, - 'voting_mode': self.voting_mode, - 'observations_per_tree_fraction': observations_per_tree_fraction, - 'impurity_threshold': float( - 0.0 if self.min_impurity_split is None else self.min_impurity_split), - 'min_weight_fraction_in_leaf_node': self.min_weight_fraction_leaf, - 'min_impurity_decrease_in_split_node': self.min_impurity_decrease, - 'tree_count': int(self.n_estimators), - 'features_per_node': features_per_node, - 'max_tree_depth': int(0 if self.max_depth is None else self.max_depth), - 'min_observations_in_leaf_node': min_observations_in_leaf_node, - 'min_observations_in_split_node': min_observations_in_split_node, - 'max_leaf_nodes': (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'memory_saving_mode': False, - 'bootstrap': bool(self.bootstrap), - 'error_metric_mode': self.error_metric_mode, - 'variable_importance_mode': self.variable_importance_mode, + "fptype": "float" if data.dtype == np.float32 else "double", + "method": self.algorithm, + "infer_mode": self.infer_mode, + "voting_mode": self.voting_mode, + "observations_per_tree_fraction": observations_per_tree_fraction, + "impurity_threshold": float( + 0.0 if self.min_impurity_split is None else self.min_impurity_split + ), + "min_weight_fraction_in_leaf_node": self.min_weight_fraction_leaf, + "min_impurity_decrease_in_split_node": self.min_impurity_decrease, + "tree_count": int(self.n_estimators), + "features_per_node": features_per_node, + "max_tree_depth": int(0 if self.max_depth is None else self.max_depth), + "min_observations_in_leaf_node": min_observations_in_leaf_node, + "min_observations_in_split_node": min_observations_in_split_node, + "max_leaf_nodes": (0 if self.max_leaf_nodes is None else self.max_leaf_nodes), + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "memory_saving_mode": False, + "bootstrap": bool(self.bootstrap), + "error_metric_mode": self.error_metric_mode, + "variable_importance_mode": self.variable_importance_mode, } if self.is_classification: - onedal_params['class_count'] = 0 if self.classes_ is None else len( - self.classes_) - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + onedal_params["class_count"] = ( + 0 if self.classes_ is None else len(self.classes_) + ) + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode return onedal_params def _check_parameters(self): if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. " + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def _validate_targets(self, y, dtype): self.class_weight_ = None @@ -311,16 +332,17 @@ def _get_sample_weight(self, sample_weight, X): sample_weight = np.asarray(sample_weight, dtype=X.dtype).ravel() sample_weight = _check_array( - sample_weight, accept_sparse=False, ensure_2d=False, - dtype=X.dtype, order="C" + sample_weight, accept_sparse=False, ensure_2d=False, dtype=X.dtype, order="C" ) if sample_weight.size != X.shape[0]: - raise ValueError("sample_weight and X have incompatible shapes: " - "%r vs %r\n" - "Note: Sparse matrices cannot be indexed w/" - "boolean masks (use `indices=True` in CV)." - % (sample_weight.shape, X.shape)) + raise ValueError( + "sample_weight and X have incompatible shapes: " + "%r vs %r\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV)." + % (sample_weight.shape, X.shape) + ) return sample_weight @@ -329,12 +351,16 @@ def _get_policy(self, queue, *data): def _fit(self, X, y, sample_weight, module, queue): X, y = _check_X_y( - X, y, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse='csr') + X, + y, + dtype=[np.float64, np.float32], + force_all_finite=True, + accept_sparse="csr", + ) y = self._validate_targets(y, X.dtype) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if sample_weight is not None and len(sample_weight) > 0: @@ -342,15 +368,13 @@ def _fit(self, X, y, sample_weight, module, queue): policy = self._get_policy(queue, X, y, sample_weight) X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) params = self._get_onedal_params(X) - train_result = module.train( - policy, params, *to_table(X, y, sample_weight)) + train_result = module.train(policy, params, *to_table(X, y, sample_weight)) else: policy = self._get_policy(queue, X, y) - X, y = = _convert_to_supported(policy, X, y) + X, y = _convert_to_supported(policy, X, y) params = self._get_onedal_params(X) - train_result = module.train( - policy, params, *to_table(X, y)) + train_result = module.train(policy, params, *to_table(X, y)) self._onedal_model = train_result.model @@ -358,7 +382,8 @@ def _fit(self, X, y, sample_weight, module, queue): if self.is_classification: self.oob_score_ = from_table(train_result.oob_err_accuracy)[0, 0] self.oob_decision_function_ = from_table( - train_result.oob_err_decision_function) + train_result.oob_err_decision_function + ) if np.any(self.oob_decision_function_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -369,7 +394,8 @@ def _fit(self, X, y, sample_weight, module, queue): else: self.oob_score_ = from_table(train_result.oob_err_r2)[0, 0] self.oob_prediction_ = from_table( - train_result.oob_err_prediction).reshape(-1) + train_result.oob_err_prediction + ).reshape(-1) if np.any(self.oob_prediction_ == 0): warnings.warn( "Some inputs do not have OOB scores. This probably means " @@ -383,12 +409,13 @@ def _fit(self, X, y, sample_weight, module, queue): def _create_model(self, module): # TODO: # upate error msg. - raise NotImplementedError('Creating model is not supported.') + raise NotImplementedError("Creating model is not supported.") def _predict(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) @@ -401,13 +428,14 @@ def _predict(self, X, module, queue): def _predict_proba(self, X, module, queue): _check_is_fitted(self) - X = _check_array(X, dtype=[np.float64, np.float32], - force_all_finite=True, accept_sparse=False) + X = _check_array( + X, dtype=[np.float64, np.float32], force_all_finite=True, accept_sparse=False + ) _check_n_features(self, X, False) policy = self._get_policy(queue, X) X = _convert_to_supported(policy, X) params = self._get_onedal_params(X) - params['infer_mode'] = 'class_probabilities' + params["infer_mode"] = "class_probabilities" model = self._onedal_model result = module.infer(policy, params, model, to_table(X)) @@ -416,33 +444,35 @@ def _predict_proba(self, X, module, queue): class RandomForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -468,12 +498,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -483,50 +515,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, - casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class RandomForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='best', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="best", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -552,49 +583,53 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, queue).ravel() class ExtraTreesClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -620,12 +655,14 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = True def _validate_targets(self, y, dtype): y, self.class_weight_, self.classes_ = _validate_targets( - y, self.class_weight, dtype) + y, self.class_weight, dtype + ) # Decapsulate classes_ attributes # TODO: @@ -635,50 +672,49 @@ def _validate_targets(self, y, dtype): return y def fit(self, X, y, sample_weight=None, queue=None): - return self._fit(X, y, sample_weight, - _backend.decision_forest.classification, queue) + return self._fit( + X, y, sample_weight, _backend.decision_forest.classification, queue + ) def predict(self, X, queue=None): pred = super()._predict(X, _backend.decision_forest.classification, queue) - return np.take( - self.classes_, - pred.ravel().astype( - np.int64, - casting='unsafe')) + return np.take(self.classes_, pred.ravel().astype(np.int64, casting="unsafe")) def predict_proba(self, X, queue=None): return super()._predict_proba(X, _backend.decision_forest.classification, queue) class ExtraTreesRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): - def __init__(self, - n_estimators=100, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - random_state=None, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - infer_mode='class_responses', - splitter_mode='random', - voting_mode='weighted', - error_metric_mode='none', - variable_importance_mode='none', - algorithm='hist', - **kwargs): + def __init__( + self, + n_estimators=100, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + random_state=None, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + infer_mode="class_responses", + splitter_mode="random", + voting_mode="weighted", + error_metric_mode="none", + variable_importance_mode="none", + algorithm="hist", + **kwargs, + ): super().__init__( n_estimators=n_estimators, criterion=criterion, @@ -704,16 +740,18 @@ def __init__(self, voting_mode=voting_mode, error_metric_mode=error_metric_mode, variable_importance_mode=variable_importance_mode, - algorithm=algorithm) + algorithm=algorithm, + ) self.is_classification = False def fit(self, X, y, sample_weight=None, queue=None): if sample_weight is not None: - if hasattr(sample_weight, '__array__'): + if hasattr(sample_weight, "__array__"): sample_weight[sample_weight == 0.0] = 1.0 sample_weight = [sample_weight] - return super()._fit(X, y, sample_weight, - _backend.decision_forest.regression, queue) + return super()._fit( + X, y, sample_weight, _backend.decision_forest.regression, queue + ) def predict(self, X, queue=None): return super()._predict(X, _backend.decision_forest.regression, queue).ravel() diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 0a09c578c9..90604fd614 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -19,13 +19,15 @@ from .get_tree import get_tree_state_cls, get_tree_state_reg __all__ = [ - 'get_tree_state_cls', - 'get_tree_state_reg', - 'linear_kernel', - 'rbf_kernel', - 'poly_kernel', - 'sigmoid_kernel'] + "get_tree_state_cls", + "get_tree_state_reg", + "linear_kernel", + "rbf_kernel", + "poly_kernel", + "sigmoid_kernel", +] -if daal_check_version((2023, 'P', 301)): +if daal_check_version((2023, "P", 301)): from .get_tree import get_forest_state - __all__ += ['get_forest_state'] + + __all__ += ["get_forest_state"] diff --git a/scripts/build_backend.py b/scripts/build_backend.py index d390af1573..7d99d9c3fe 100755 --- a/scripts/build_backend.py +++ b/scripts/build_backend.py @@ -173,8 +173,7 @@ def custom_build_cmake_clib(iface, cxx=None, onedal_major_binary_version=1, no_d "-DPYTHON_LIBRARY_DIR=" + python_library_dir, "-DoneDAL_INCLUDE_DIRS=" + jp(os.environ['DALROOT'], 'include'), "-DoneDAL_LIBRARY_DIR=" + jp(os.environ['DALROOT'], 'lib', 'intel64'), - "-Dpybind11_DIR=" + pybind11.get_cmake_dir(), - "-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON" + "-Dpybind11_DIR=" + pybind11.get_cmake_dir() ] if dpctl_available: diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index ab8650c032..be7ebb7cdf 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -#=============================================================================== +# =============================================================================== # Copyright 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,11 +13,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#=============================================================================== +# =============================================================================== from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, - make2d, PatchingConditionsChain, check_tree_nodes + daal_check_version, + sklearn_check_version, + make2d, + PatchingConditionsChain, + check_tree_nodes, ) import numpy as np @@ -40,7 +43,8 @@ check_is_fitted, check_consistent_length, check_array, - check_X_y) + check_X_y, +) from onedal.utils import _num_features, _num_samples @@ -57,10 +61,10 @@ from scipy import sparse as sp -if sklearn_check_version('1.2'): +if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval -if daal_check_version((2023, 'P', 301)): +if daal_check_version((2023, "P", 301)): from onedal.primitives import get_forest_state @@ -72,7 +76,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -82,8 +86,9 @@ def _save_attributes(self): if hasattr(self._onedal_estimator, "oob_prediction_"): self.oob_prediction_ = self._onedal_estimator.oob_prediction_ if hasattr(self._onedal_estimator, "oob_decision_function_"): - self.oob_decision_function_ = \ + self.oob_decision_function_ = ( self._onedal_estimator.oob_decision_function_ + ) return self def _onedal_classifier(self, **onedal_params): @@ -95,69 +100,79 @@ def _onedal_regressor(self, **onedal_params): # TODO: # move to onedal modul. def _check_parameters(self): - if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if getattr(self, "min_impurity_split", None) is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if getattr(self, "min_impurity_split") < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. " + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if getattr(self, "min_impurity_split") < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -177,49 +192,55 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class ExtraTreesClassifier(sklearn_ExtraTreesClassifier, BaseTree): __doc__ = sklearn_ExtraTreesClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -236,7 +257,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -245,28 +266,31 @@ def __init__( self.min_bin_size = min_bin_size else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -286,7 +310,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -320,17 +344,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -338,22 +367,33 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") - - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (self.criterion == "gini", - f"'{self.criterion}' criterion is not supported. " - "Only 'gini' criterion is supported."), - (self.warm_start is False, "Warm start is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + raise ValueError("Out of bag estimation only available" " if bootstrap=True") + + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + ( + self.criterion == "gini", + f"'{self.criterion}' criterion is not supported. " + "Only 'gini' criterion is supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -367,22 +407,29 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - (y.dtype in [np.float32, np.float64, np.int32, np.int64], - f"Datatype ({y.dtype}) for y is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ( + y.dtype in [np.float32, np.float64, np.int32, np.int64], + f"Datatype ({y.dtype}) for y is not supported.", + ), + ] + ) # TODO: Fix to support integers as input n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -391,9 +438,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -436,10 +483,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -470,94 +522,102 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but ExtraTreesClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_ExtraTreesClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but ExtraTreesClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_ExtraTreesClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution estimators_ = [] - + random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): allstates = get_forest_state(self._onedal_model, n_classes_) - + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): tree_i_state_dict = allstates[i] - tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) else: - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} - + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } + est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_state_dict) estimators_.append(est_i) @@ -567,48 +627,64 @@ def _estimators_(self): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), "sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready = dal_ready and not hasattr(self, 'estimators_') + dal_ready = dal_ready and not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready = dal_ready and _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready = dal_ready and _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -616,62 +692,85 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported.") - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = hasattr(self, '_onedal_model') and hasattr(self, 'n_outputs_') + dal_ready = hasattr(self, "_onedal_model") and hasattr(self, "n_outputs_") if dal_ready: - dal_ready = _patching_status.and_conditions([ - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) + dal_ready = _patching_status.and_conditions( + [ + ( + not sp.issparse(X), + "X is sparse. Sparse input is not supported.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -695,7 +794,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -707,43 +806,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" - if not sklearn_check_version('1.0'): - onedal_params['min_impurity_split'] = self.min_impurity_split + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" + if not sklearn_check_version("1.0"): + onedal_params["min_impurity_split"] = self.min_impurity_split else: - onedal_params['min_impurity_split'] = None + onedal_params["min_impurity_split"] = None self._cached_estimators_ = None # Compute @@ -766,13 +864,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -782,36 +879,38 @@ def _onedal_predict_proba(self, X, queue=None): class ExtraTreesRegressor(sklearn_ExtraTreesRegressor, BaseTree): __doc__ = sklearn_ExtraTreesRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_ExtraTreesRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], - "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")] + "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -827,36 +926,40 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha self.max_samples = max_samples self.max_bins = max_bins self.min_bin_size = min_bin_size + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1 - ): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + ): super(ExtraTreesRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -875,7 +978,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -885,62 +988,61 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = ExtraTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): allstates = get_forest_state(self._onedal_model) - + for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): tree_i_state_dict = allstates[i] - tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) else: - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -948,9 +1050,7 @@ def _estimators_(self): def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() @@ -958,30 +1058,41 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): self._check_parameters() if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") - if sklearn_check_version('1.0') and self.criterion == "mse": + if sklearn_check_version("1.0") and self.criterion == "mse": warnings.warn( "Criterion 'mse' was deprecated in v1.0 and will be " "removed in version 1.2. Use `criterion='squared_error'` " "which is equivalent.", - FutureWarning + FutureWarning, ) - ready = patching_status.and_conditions([ - (self.oob_score and daal_check_version((2021, 'P', 500)) or not - self.oob_score, - "OOB score is only supported starting from 2021.5 version of oneDAL."), - (self.warm_start is False, "Warm start is not supported."), - (self.criterion in ["mse", "squared_error"], - f"'{self.criterion}' criterion is not supported. " - "Only 'mse' and 'squared_error' criteria are supported."), - (self.ccp_alpha == 0.0, - f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.n_estimators <= 6024, "More than 6024 estimators is not supported.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score, + "OOB score is only supported starting from 2021.5 version of oneDAL.", + ), + (self.warm_start is False, "Warm start is not supported."), + ( + self.criterion in ["mse", "squared_error"], + f"'{self.criterion}' criterion is not supported. " + "Only 'mse' and 'squared_error' criteria are supported.", + ), + ( + self.ccp_alpha == 0.0, + f"Non-zero 'ccp_alpha' ({self.ccp_alpha}) is not supported.", + ), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + ( + self.n_estimators <= 6024, + "More than 6024 estimators is not supported.", + ), + ] + ) if ready: if sklearn_check_version("1.0"): @@ -991,10 +1102,13 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warnings.warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warnings.warn( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) @@ -1005,14 +1119,18 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): y = np.reshape(y, (-1, 1)) self.n_outputs_ = y.shape[1] - ready = patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1.") - ]) + ready = patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ) + ] + ) n_samples = X.shape[0] if isinstance(self.max_samples, numbers.Integral): - if not sklearn_check_version('1.2'): + if not sklearn_check_version("1.2"): if not (1 <= self.max_samples <= n_samples): msg = "`max_samples` must be in range 1 to {} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) @@ -1021,9 +1139,9 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): msg = "`max_samples` must be <= n_samples={} but got value {}" raise ValueError(msg.format(n_samples, self.max_samples)) elif isinstance(self.max_samples, numbers.Real): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): pass - elif sklearn_check_version('1.0'): + elif sklearn_check_version("1.0"): if not (0 < float(self.max_samples) <= 1): msg = "`max_samples` must be in range (0.0, 1.0] but got value {}" raise ValueError(msg.format(self.max_samples)) @@ -1047,48 +1165,64 @@ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2"), - (not sp.issparse(sample_weight), "sample_weight is sparse. " - "Sparse input is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ( + not sp.issparse(sample_weight), + "sample_weight is sparse. " "Sparse input is not supported.", + ), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - (daal_check_version((2023, 'P', 200)), - "ExtraTrees only supported starting from oneDAL version 2023.2") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 200)), + "ExtraTrees only supported starting from oneDAL version 2023.2", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: dal_ready = False else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready @@ -1096,55 +1230,66 @@ def _onedal_cpu_supported(self, method_name, *data): def _onedal_gpu_supported(self, method_name, *data): class_name = self.__class__.__name__ _patching_status = PatchingConditionsChain( - f'sklearn.ensemble.{class_name}.{method_name}') + f"sklearn.ensemble.{class_name}.{method_name}" + ) - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_fit_ready(_patching_status, *data) - dal_ready = ready and _patching_status.and_conditions([ - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1"), - (sample_weight is not None, "sample_weight is not supported."), - ]) + dal_ready = ready and _patching_status.and_conditions( + [ + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + (sample_weight is not None, "sample_weight is not supported."), + ] + ) - dal_ready &= not hasattr(self, 'estimators_') + dal_ready &= not hasattr(self, "estimators_") if dal_ready and (self.random_state is not None): - warnings.warn("Setting 'random_state' value is not supported. " - "State set by oneDAL to default value (777).", - RuntimeWarning) - - elif method_name in ['predict', - 'predict_proba']: + warnings.warn( + "Setting 'random_state' value is not supported. " + "State set by oneDAL to default value (777).", + RuntimeWarning, + ) + elif method_name in ["predict", "predict_proba"]: X = data[0] - dal_ready = _patching_status.and_conditions([ - (hasattr(self, '_onedal_model'), "oneDAL model was not trained."), - (not sp.issparse(X), "X is sparse. Sparse input is not supported."), - (self.warm_start is False, "Warm start is not supported."), - - (daal_check_version((2023, 'P', 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1") - ]) - if hasattr(self, 'n_outputs_'): - dal_ready &= _patching_status.and_conditions([ - (self.n_outputs_ == 1, - f"Number of outputs ({self.n_outputs_}) is not 1."), - ]) + dal_ready = _patching_status.and_conditions( + [ + (hasattr(self, "_onedal_model"), "oneDAL model was not trained."), + (not sp.issparse(X), "X is sparse. Sparse input is not supported."), + (self.warm_start is False, "Warm start is not supported."), + ( + daal_check_version((2023, "P", 100)), + "ExtraTrees only supported starting from oneDAL version 2023.1", + ), + ] + ) + if hasattr(self, "n_outputs_"): + dal_ready &= _patching_status.and_conditions( + [ + ( + self.n_outputs_ == 1, + f"Number of outputs ({self.n_outputs_}) is not 1.", + ), + ] + ) else: raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + f"Unknown method {method_name} in {self.__class__.__name__}" + ) _patching_status.write_log() return dal_ready def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1161,41 +1306,42 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = "random" + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = "random" self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1238,10 +1384,17 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_ExtraTreesRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_ExtraTreesRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1266,15 +1419,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_ExtraTreesRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_ExtraTreesRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 8e52732bb2..2316e3fcb1 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -16,8 +16,10 @@ # =============================================================================== from daal4py.sklearn._utils import ( - daal_check_version, sklearn_check_version, - make2d, check_tree_nodes + daal_check_version, + sklearn_check_version, + make2d, + check_tree_nodes, ) import numpy as np @@ -40,7 +42,8 @@ check_is_fitted, check_consistent_length, check_array, - check_X_y) + check_X_y, +) from onedal.utils import _num_features, _num_samples @@ -57,10 +60,10 @@ from scipy import sparse as sp -if sklearn_check_version('1.2'): +if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval, StrOptions -if daal_check_version((2023, 'P', 301)): +if daal_check_version((2023, "P", 301)): from onedal.primitives import get_forest_state @@ -72,7 +75,7 @@ def _fit_proba(self, X, y, sample_weight=None, queue=None): # We use stock metaestimators below, so the only way # to pass a queue is using config_context. cfg = get_config() - cfg['target_offload'] = queue + cfg["target_offload"] = queue def _save_attributes(self): self._onedal_model = self._onedal_estimator._onedal_model @@ -100,66 +103,77 @@ def _check_parameters(self): ) if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) else: # float - if not 0. < self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if self.min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if self.min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. " + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. " + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if self.min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) if self.max_leaf_nodes is not None: if not isinstance(self.max_leaf_nodes, numbers.Integral): raise ValueError( "max_leaf_nodes must be integral number but was " - "%r" % - self.max_leaf_nodes) + "%r" % self.max_leaf_nodes + ) if self.max_leaf_nodes < 2: raise ValueError( - ("max_leaf_nodes {0} must be either None " - "or larger than 1").format( - self.max_leaf_nodes)) + ("max_leaf_nodes {0} must be either None " "or larger than 1").format( + self.max_leaf_nodes + ) + ) if isinstance(self.max_bins, numbers.Integral): if not 2 <= self.max_bins: - raise ValueError("max_bins must be at least 2, got %s" - % self.max_bins) + raise ValueError("max_bins must be at least 2, got %s" % self.max_bins) else: - raise ValueError("max_bins must be integral number but was " - "%r" % self.max_bins) + raise ValueError( + "max_bins must be integral number but was " "%r" % self.max_bins + ) if isinstance(self.min_bin_size, numbers.Integral): if not 1 <= self.min_bin_size: - raise ValueError("min_bin_size must be at least 1, got %s" - % self.min_bin_size) + raise ValueError( + "min_bin_size must be at least 1, got %s" % self.min_bin_size + ) else: - raise ValueError("min_bin_size must be integral number but was " - "%r" % self.min_bin_size) + raise ValueError( + "min_bin_size must be integral number but was " "%r" % self.min_bin_size + ) def check_sample_weight(self, sample_weight, X, dtype=None): n_samples = _num_samples(X) @@ -179,51 +193,57 @@ def check_sample_weight(self, sample_weight, X, dtype=None): accept_sparse=False, ensure_2d=False, dtype=dtype, - order="C") + order="C", + ) if sample_weight.ndim != 1: raise ValueError("Sample weights must be 1D array or scalar") if sample_weight.shape != (n_samples,): - raise ValueError("sample_weight.shape == {}, expected {}!" - .format(sample_weight.shape, (n_samples,))) + raise ValueError( + "sample_weight.shape == {}, expected {}!".format( + sample_weight.shape, (n_samples,) + ) + ) return sample_weight class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest): __doc__ = sklearn_RandomForestClassifier.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestClassifier._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features='sqrt' if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt" if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -240,7 +260,7 @@ def __init__( random_state=random_state, verbose=verbose, warm_start=warm_start, - class_weight=class_weight + class_weight=class_weight, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -250,30 +270,34 @@ def __init__( self.min_impurity_split = None self.splitter_mode = splitter_mode # self._estimator = DecisionTreeClassifier() + else: - def __init__(self, - n_estimators=100, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestClassifier, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -293,7 +317,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -330,17 +354,22 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestClassifier.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestClassifier.fit, + }, + X, + y, + sample_weight, + ) return self def _onedal_ready(self, X, y, sample_weight): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. " @@ -348,8 +377,7 @@ def _onedal_ready(self, X, y, sample_weight): "`max_sample=None`." ) if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError("Out of bag estimation only available" " if bootstrap=True") if sklearn_check_version("1.2"): self._validate_params() else: @@ -360,16 +388,20 @@ def _onedal_ready(self, X, y, sample_weight): correct_criterion = self.criterion == "gini" correct_warm_start = self.warm_start is False - if daal_check_version((2021, 'P', 500)): + if daal_check_version((2021, "P", 500)): correct_oob_score = not self.oob_score else: correct_oob_score = self.oob_score - ready = all([correct_oob_score, - correct_sparsity, - correct_ccp_alpha, - correct_criterion, - correct_warm_start]) + ready = all( + [ + correct_oob_score, + correct_sparsity, + correct_ccp_alpha, + correct_criterion, + correct_warm_start, + ] + ) if ready: if sklearn_check_version("1.0"): self._check_feature_names(X, reset=True) @@ -382,7 +414,8 @@ def _onedal_ready(self, X, y, sample_weight): " expected. Please change the shape of y to " "(n_samples,), for example using ravel().", DataConversionWarning, - stacklevel=2) + stacklevel=2, + ) check_consistent_length(X, y) y = make2d(y) @@ -415,10 +448,15 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestClassifier.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestClassifier.predict, + }, + X, + ) @wrap_output_data def predict_proba(self, X): @@ -449,93 +487,101 @@ def predict_proba(self, X): # self._check_proba() if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - if hasattr(self, 'n_features_in_'): + if hasattr(self, "n_features_in_"): try: num_features = _num_features(X) except TypeError: num_features = _num_samples(X) if num_features != self.n_features_in_: raise ValueError( - (f'X has {num_features} features, ' - f'but RandomForestClassifier is expecting ' - f'{self.n_features_in_} features as input')) - return dispatch(self, 'predict_proba', { - 'onedal': self.__class__._onedal_predict_proba, - 'sklearn': sklearn_RandomForestClassifier.predict_proba, - }, X) - - if sklearn_check_version('1.0'): + ( + f"X has {num_features} features, " + f"but RandomForestClassifier is expecting " + f"{self.n_features_in_} features as input" + ) + ) + return dispatch( + self, + "predict_proba", + { + "onedal": self.__class__._onedal_predict_proba, + "sklearn": sklearn_RandomForestClassifier.predict_proba, + }, + X, + ) + + if sklearn_check_version("1.0"): + @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeClassifier(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): allstates = get_forest_state(self._onedal_model, n_classes_) for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): tree_i_state_dict = allstates[i] - tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) else: - tree_i_state_class = get_tree_state_cls( - self._onedal_model, i, n_classes_) + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, - np.array( - [n_classes_], - dtype=np.intp), - self.n_outputs_) + np.array([n_classes_], dtype=np.intp), + self.n_outputs_, + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -543,13 +589,16 @@ def _estimators_(self): return estimators_ def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -562,39 +611,42 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False elif sp.issparse(X): @@ -613,37 +665,42 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name in ['predict', 'predict_proba']: + if method_name in ["predict", "predict_proba"]: X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): X, y = self._validate_data( - X, y, multi_output=False, accept_sparse=False, - dtype=[np.float64, np.float32] + X, + y, + multi_output=False, + accept_sparse=False, + dtype=[np.float64, np.float32], ) else: X, y = check_X_y( - X, y, accept_sparse=False, dtype=[np.float64, np.float32], - multi_output=False + X, + y, + accept_sparse=False, + dtype=[np.float64, np.float32], + multi_output=False, ) if sample_weight is not None: @@ -667,7 +724,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): n_classes_ = self.n_classes_[0] self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ if expanded_class_weight is not None: @@ -679,40 +736,39 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): sample_weight = [sample_weight] if n_classes_ < 2: - raise ValueError( - "Training data only contain information about one class.") + raise ValueError("Training data only contain information about one class.") if self.oob_score: - err = 'out_of_bag_error_accuracy|out_of_bag_error_decision_function' + err = "out_of_bag_error_accuracy|out_of_bag_error_decision_function" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'min_impurity_split': self.min_impurity_split, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': self.random_state, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'class_weight': self.class_weight, - 'max_bins': self.max_bins, - 'min_bin_size': self.min_bin_size, - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "min_impurity_split": self.min_impurity_split, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": self.random_state, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "class_weight": self.class_weight, + "max_bins": self.max_bins, + "min_bin_size": self.min_bin_size, + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None # Compute @@ -735,13 +791,12 @@ def _onedal_predict(self, X, queue=None): self._check_feature_names(X, reset=False) res = self._onedal_estimator.predict(X, queue=queue) - return np.take(self.classes_, - res.ravel().astype(np.int64, casting='unsafe')) + return np.take(self.classes_, res.ravel().astype(np.int64, casting="unsafe")) def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) check_is_fitted(self) - if sklearn_check_version('0.23'): + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -751,38 +806,40 @@ def _onedal_predict_proba(self, X, queue=None): class RandomForestRegressor(sklearn_RandomForestRegressor, BaseRandomForest): __doc__ = sklearn_RandomForestRegressor.__doc__ - if sklearn_check_version('1.2'): + if sklearn_check_version("1.2"): _parameter_constraints: dict = { **sklearn_RandomForestRegressor._parameter_constraints, "max_bins": [Interval(numbers.Integral, 2, None, closed="left")], "min_bin_size": [Interval(numbers.Integral, 1, None, closed="left")], - "splitter_mode": [StrOptions({"best", "random"})] + "splitter_mode": [StrOptions({"best", "random"})], } - if sklearn_check_version('1.0'): + if sklearn_check_version("1.0"): + def __init__( - self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=1.0 if sklearn_check_version('1.1') else 'auto', - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=1.0 if sklearn_check_version("1.1") else "auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -798,7 +855,7 @@ def __init__( n_jobs=n_jobs, random_state=random_state, verbose=verbose, - warm_start=warm_start + warm_start=warm_start, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -807,29 +864,34 @@ def __init__( self.min_bin_size = min_bin_size self.min_impurity_split = None self.splitter_mode = splitter_mode + else: - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - max_bins=256, - min_bin_size=1, - splitter_mode='best'): + + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + max_bins=256, + min_bin_size=1, + splitter_mode="best", + ): super(RandomForestRegressor, self).__init__( n_estimators=n_estimators, criterion=criterion, @@ -848,7 +910,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, ccp_alpha=ccp_alpha, - max_samples=max_samples + max_samples=max_samples, ) self.warm_start = warm_start self.ccp_alpha = ccp_alpha @@ -860,27 +922,27 @@ def __init__(self, @property def _estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_: return self._cached_estimators_ - if sklearn_check_version('0.22'): + if sklearn_check_version("0.22"): check_is_fitted(self) else: - check_is_fitted(self, '_onedal_model') + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'random_state': None, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "random_state": None, } - if not sklearn_check_version('1.0'): - params['min_impurity_split'] = self.min_impurity_split + if not sklearn_check_version("1.0"): + params["min_impurity_split"] = self.min_impurity_split est = DecisionTreeRegressor(**params) # we need to set est.tree_ field with Trees constructed from Intel(R) # oneAPI Data Analytics Library solution @@ -891,30 +953,29 @@ def _estimators_(self): for i in range(self.n_estimators): est_i = clone(est) est_i.set_params( - random_state=random_state_checked.randint( - np.iinfo( - np.int32).max)) - if sklearn_check_version('1.0'): + random_state=random_state_checked.randint(np.iinfo(np.int32).max) + ) + if sklearn_check_version("1.0"): est_i.n_features_in_ = self.n_features_in_ else: est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - if daal_check_version((2023, 'P', 301)): + if daal_check_version((2023, "P", 301)): tree_i_state_dict = allstates[i] - tree_i_state_dict['nodes'] = check_tree_nodes(tree_i_state_dict['nodes']) + tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) else: - tree_i_state_class = get_tree_state_reg( - self._onedal_model, i) + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { - 'max_depth': tree_i_state_class.max_depth, - 'node_count': tree_i_state_class.node_count, - 'nodes': check_tree_nodes(tree_i_state_class.node_ar), - 'values': tree_i_state_class.value_ar} + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( - self.n_features_in_, np.array( - [1], dtype=np.intp), self.n_outputs_) + self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ + ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) @@ -937,17 +998,23 @@ def _onedal_ready(self, X, y, sample_weight): return ready, X, y, sample_weight def _onedal_cpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random': - warnings.warn("'random' splitter mode supports GPU devices only " - "and requires oneDAL version >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random": + warnings.warn( + "'random' splitter mode supports GPU devices only " + "and requires oneDAL version >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -961,42 +1028,48 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif self.warm_start: return False - elif self.oob_score and not daal_check_version((2023, 'P', 101)): + elif self.oob_score and not daal_check_version((2023, "P", 101)): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': - if not hasattr(self, '_onedal_model'): + if method_name == "predict": + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(data[0]): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_gpu_supported(self, method_name, *data): - if method_name == 'fit': + if method_name == "fit": ready, X, y, sample_weight = self._onedal_ready(*data) - if self.splitter_mode == 'random' and \ - not daal_check_version((2023, 'P', 101)): - warnings.warn("'random' splitter mode requires OneDAL >= 2023.1.1. " - "Using 'best' mode instead.", RuntimeWarning) - self.splitter_mode = 'best' + if self.splitter_mode == "random" and not daal_check_version( + (2023, "P", 101) + ): + warnings.warn( + "'random' splitter mode requires OneDAL >= 2023.1.1. " + "Using 'best' mode instead.", + RuntimeWarning, + ) + self.splitter_mode = "best" if not ready: return False - elif not (self.oob_score and daal_check_version( - (2021, 'P', 500)) or not self.oob_score): + elif not ( + self.oob_score + and daal_check_version((2021, "P", 500)) + or not self.oob_score + ): return False elif self.criterion not in ["mse", "squared_error"]: return False @@ -1012,32 +1085,29 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif self.oob_score: return False - elif hasattr(self, 'estimators_'): + elif hasattr(self, "estimators_"): return False else: return True - if method_name == 'predict': + if method_name == "predict": X = data[0] - if not hasattr(self, '_onedal_model'): + if not hasattr(self, "_onedal_model"): return False elif sp.issparse(X): return False - elif not (hasattr(self, 'n_outputs_') and self.n_outputs_ == 1): + elif not (hasattr(self, "n_outputs_") and self.n_outputs_ == 1): return False - elif not daal_check_version((2021, 'P', 400)): + elif not daal_check_version((2021, "P", 400)): return False elif self.warm_start: return False else: return True - raise RuntimeError( - f'Unknown method {method_name} in {self.__class__.__name__}') + raise RuntimeError(f"Unknown method {method_name} in {self.__class__.__name__}") def _onedal_fit(self, X, y, sample_weight=None, queue=None): if sp.issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) + raise ValueError("sparse multilabel-indicator for y is not supported.") if sklearn_check_version("1.2"): self._validate_params() else: @@ -1051,37 +1121,37 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): y = check_array(y, ensure_2d=False, dtype=X.dtype) check_consistent_length(X, y) self.n_features_in_ = X.shape[1] - if not sklearn_check_version('1.0'): + if not sklearn_check_version("1.0"): self.n_features_ = self.n_features_in_ rs_ = check_random_state(self.random_state) if self.oob_score: - err = 'out_of_bag_error_r2|out_of_bag_error_prediction' + err = "out_of_bag_error_r2|out_of_bag_error_prediction" else: - err = 'none' + err = "none" onedal_params = { - 'n_estimators': self.n_estimators, - 'criterion': self.criterion, - 'max_depth': self.max_depth, - 'min_samples_split': self.min_samples_split, - 'min_samples_leaf': self.min_samples_leaf, - 'min_weight_fraction_leaf': self.min_weight_fraction_leaf, - 'max_features': self.max_features, - 'max_leaf_nodes': self.max_leaf_nodes, - 'min_impurity_decrease': self.min_impurity_decrease, - 'bootstrap': self.bootstrap, - 'oob_score': self.oob_score, - 'n_jobs': self.n_jobs, - 'random_state': rs_, - 'verbose': self.verbose, - 'warm_start': self.warm_start, - 'error_metric_mode': err, - 'variable_importance_mode': 'mdi', - 'max_samples': self.max_samples + "n_estimators": self.n_estimators, + "criterion": self.criterion, + "max_depth": self.max_depth, + "min_samples_split": self.min_samples_split, + "min_samples_leaf": self.min_samples_leaf, + "min_weight_fraction_leaf": self.min_weight_fraction_leaf, + "max_features": self.max_features, + "max_leaf_nodes": self.max_leaf_nodes, + "min_impurity_decrease": self.min_impurity_decrease, + "bootstrap": self.bootstrap, + "oob_score": self.oob_score, + "n_jobs": self.n_jobs, + "random_state": rs_, + "verbose": self.verbose, + "warm_start": self.warm_start, + "error_metric_mode": err, + "variable_importance_mode": "mdi", + "max_samples": self.max_samples, } - if daal_check_version((2023, 'P', 101)): - onedal_params['splitter_mode'] = self.splitter_mode + if daal_check_version((2023, "P", 101)): + onedal_params["splitter_mode"] = self.splitter_mode self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) @@ -1130,10 +1200,17 @@ def fit(self, X, y, sample_weight=None): "Either switch to `bootstrap=True` or set " "`max_sample=None`." ) - dispatch(self, 'fit', { - 'onedal': self.__class__._onedal_fit, - 'sklearn': sklearn_RandomForestRegressor.fit, - }, X, y, sample_weight) + dispatch( + self, + "fit", + { + "onedal": self.__class__._onedal_fit, + "sklearn": sklearn_RandomForestRegressor.fit, + }, + X, + y, + sample_weight, + ) return self @wrap_output_data @@ -1158,15 +1235,22 @@ def predict(self, X): y : ndarray of shape (n_samples,) or (n_samples, n_outputs) The predicted classes. """ - return dispatch(self, 'predict', { - 'onedal': self.__class__._onedal_predict, - 'sklearn': sklearn_RandomForestRegressor.predict, - }, X) + return dispatch( + self, + "predict", + { + "onedal": self.__class__._onedal_predict, + "sklearn": sklearn_RandomForestRegressor.predict, + }, + X, + ) + + if sklearn_check_version("1.0"): - if sklearn_check_version('1.0'): @deprecated( "Attribute `n_features_` was deprecated in version 1.0 and will be " - "removed in 1.2. Use `n_features_in_` instead.") + "removed in 1.2. Use `n_features_in_` instead." + ) @property def n_features_(self): return self.n_features_in_ From f3a07523f012c72a58d3cc75c16f7adaeca7fa7f Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:56:19 -0700 Subject: [PATCH 33/52] isorted --- onedal/ensemble/forest.py | 35 ++++++++-------- onedal/primitives/__init__.py | 3 +- scripts/build_backend.py | 2 +- sklearnex/preview/ensemble/extra_trees.py | 49 +++++++++-------------- sklearnex/preview/ensemble/forest.py | 47 +++++++++------------- 5 files changed, 57 insertions(+), 79 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 940500f36d..5632269f07 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -14,43 +14,42 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import daal_check_version, sklearn_check_version - -from abc import ABCMeta, abstractmethod import numbers -from numbers import Number import warnings +from abc import ABCMeta, abstractmethod +from math import ceil +from numbers import Number + +import numpy as np +from scipy import sparse as sp +from sklearn.ensemble import BaseEnsemble from sklearn.exceptions import DataConversionWarning from sklearn.utils import ( + check_array, check_random_state, compute_sample_weight, - check_array, deprecated, ) from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, _num_samples, + check_consistent_length, + check_is_fitted, ) -from math import ceil -import numpy as np -from scipy import sparse as sp +from daal4py.sklearn._utils import daal_check_version, sklearn_check_version +from onedal import _backend +from ..common._estimator_checks import _check_is_fitted from ..common._mixin import ClassifierMixin, RegressorMixin from ..common._policy import _get_policy -from ..common._estimator_checks import _check_is_fitted -from ..datatypes import from_table, to_table, _convert_to_supported +from ..datatypes import _convert_to_supported, from_table, to_table from ..utils import ( - _validate_targets, - _check_X_y, _check_array, - _column_or_1d, _check_n_features, + _check_X_y, + _column_or_1d, + _validate_targets, ) -from onedal import _backend - -from sklearn.ensemble import BaseEnsemble class BaseForest(BaseEnsemble, metaclass=ABCMeta): diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 90604fd614..544a8ad1e7 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -15,8 +15,9 @@ # =============================================================================== from daal4py.sklearn._utils import daal_check_version -from .kernel_functions import linear_kernel, rbf_kernel, poly_kernel, sigmoid_kernel + from .get_tree import get_tree_state_cls, get_tree_state_reg +from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel __all__ = [ "get_tree_state_cls", diff --git a/scripts/build_backend.py b/scripts/build_backend.py index 7d99d9c3fe..b89ae07653 100755 --- a/scripts/build_backend.py +++ b/scripts/build_backend.py @@ -173,7 +173,7 @@ def custom_build_cmake_clib(iface, cxx=None, onedal_major_binary_version=1, no_d "-DPYTHON_LIBRARY_DIR=" + python_library_dir, "-DoneDAL_INCLUDE_DIRS=" + jp(os.environ['DALROOT'], 'include'), "-DoneDAL_LIBRARY_DIR=" + jp(os.environ['DALROOT'], 'lib', 'intel64'), - "-Dpybind11_DIR=" + pybind11.get_cmake_dir() + "-Dpybind11_DIR=" + pybind11.get_cmake_dir(), ] if dpctl_available: diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index be7ebb7cdf..ce21fdb429 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -15,51 +15,40 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, - sklearn_check_version, - make2d, - PatchingConditionsChain, - check_tree_nodes, -) - -import numpy as np - import numbers - import warnings - from abc import ABC -from sklearn.exceptions import DataConversionWarning - -from ..._config import get_config -from ..._device_offload import dispatch, wrap_output_data - +import numpy as np +from scipy import sparse as sp +from sklearn.base import clone from sklearn.ensemble import ExtraTreesClassifier as sklearn_ExtraTreesClassifier from sklearn.ensemble import ExtraTreesRegressor as sklearn_ExtraTreesRegressor - +from sklearn.exceptions import DataConversionWarning +from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor +from sklearn.tree._tree import Tree +from sklearn.utils import check_random_state, deprecated from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, check_array, + check_consistent_length, + check_is_fitted, check_X_y, ) -from onedal.utils import _num_features, _num_samples - -from sklearn.utils import check_random_state, deprecated - -from sklearn.base import clone - -from sklearn.tree import ExtraTreeClassifier, ExtraTreeRegressor -from sklearn.tree._tree import Tree - +from daal4py.sklearn._utils import ( + PatchingConditionsChain, + check_tree_nodes, + daal_check_version, + make2d, + sklearn_check_version, +) from onedal.ensemble import ExtraTreesClassifier as onedal_ExtraTreesClassifier from onedal.ensemble import ExtraTreesRegressor as onedal_ExtraTreesRegressor from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.utils import _num_features, _num_samples -from scipy import sparse as sp +from ..._config import get_config +from ..._device_offload import dispatch, wrap_output_data if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 2316e3fcb1..327c1419f2 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -15,50 +15,39 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import ( - daal_check_version, - sklearn_check_version, - make2d, - check_tree_nodes, -) - -import numpy as np - import numbers - import warnings - from abc import ABC -from sklearn.exceptions import DataConversionWarning - -from ..._config import get_config -from ..._device_offload import dispatch, wrap_output_data - +import numpy as np +from scipy import sparse as sp +from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier as sklearn_RandomForestClassifier from sklearn.ensemble import RandomForestRegressor as sklearn_RandomForestRegressor - +from sklearn.exceptions import DataConversionWarning +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.tree._tree import Tree +from sklearn.utils import check_random_state, deprecated from sklearn.utils.validation import ( - check_is_fitted, - check_consistent_length, check_array, + check_consistent_length, + check_is_fitted, check_X_y, ) -from onedal.utils import _num_features, _num_samples - -from sklearn.utils import check_random_state, deprecated - -from sklearn.base import clone - -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from sklearn.tree._tree import Tree - +from daal4py.sklearn._utils import ( + check_tree_nodes, + daal_check_version, + make2d, + sklearn_check_version, +) from onedal.ensemble import RandomForestClassifier as onedal_RandomForestClassifier from onedal.ensemble import RandomForestRegressor as onedal_RandomForestRegressor from onedal.primitives import get_tree_state_cls, get_tree_state_reg +from onedal.utils import _num_features, _num_samples -from scipy import sparse as sp +from ..._config import get_config +from ..._device_offload import dispatch, wrap_output_data if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval, StrOptions From 053633c3f8cddb5602e1d4ec707d7832c37cfe02 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 06:57:28 -0700 Subject: [PATCH 34/52] black is beautiful --- onedal/primitives/get_tree.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 202244496c..2d330f5093 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -19,15 +19,15 @@ def get_tree_state_cls(model, iTree, n_classes): - return _backend.get_tree.classification.get_tree_state( - model, iTree, n_classes) + return _backend.get_tree.classification.get_tree_state(model, iTree, n_classes) def get_tree_state_reg(model, iTree): return _backend.get_tree.regression.get_tree_state(model, iTree, 1) -if daal_check_version((2023, 'P', 301)): +if daal_check_version((2023, "P", 301)): + def get_forest_state(model, n_classes=None): if n_classes: return _backend.get_tree.classification.get_all_states(model, n_classes) From b13ec0941db030ceb9b333fb2ed04aa5d0a25fb7 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 07:27:37 -0700 Subject: [PATCH 35/52] reduced text --- sklearnex/preview/ensemble/extra_trees.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index ce21fdb429..41f37127bd 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -607,7 +607,7 @@ def _estimators_(self): np.array([n_classes_], dtype=np.intp), self.n_outputs_, ) - est_i.tree_.__setstate__(tree_state_dict) + est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) self._cached_estimators_ = estimators_ @@ -721,7 +721,7 @@ def _onedal_gpu_supported(self, method_name, *data): (self.warm_start is False, "Warm start is not supported."), ( daal_check_version((2023, "P", 100)), - "ExtraTrees only supported starting from oneDAL version 2023.1", + "ExtraTrees supported starting from oneDAL version 2023.1", ), ] ) From 9bbe4bc3b0ffae78b605290257c26556e4550395 Mon Sep 17 00:00:00 2001 From: icfaust Date: Fri, 21 Jul 2023 09:11:16 -0700 Subject: [PATCH 36/52] forgotten version check --- sklearnex/preview/ensemble/forest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 327c1419f2..79440ed87e 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -937,7 +937,8 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - allstates = get_forest_state(self._onedal_model) + if daal_check_version((2023, "P", 301)): + allstates = get_forest_state(self._onedal_model) for i in range(self.n_estimators): est_i = clone(est) From a2633ddedd6af1c0130f823f66817c8dc8174a79 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 02:42:19 -0700 Subject: [PATCH 37/52] correct memory leak --- onedal/primitives/tree_visitor.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 4b59c1bdbf..c987ef0d1e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -191,14 +191,20 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t // array_t doesn't initialize the underlying memory with the object's constructor // so the values will not match what is defined above, must be done on C++ side - this->node_ar = py::array_t(node_ar_shape, node_ar_strides, this->node_ar_ptr, py::none()); - this->value_ar = py::array_t(value_ar_shape, value_ar_strides, this->value_ar_ptr, py::none()); - py::buffer_info node_ar_buf = this->node_ar.request(); - //this->node_ar_ptr = static_cast(node_ar_buf.ptr); - - py::buffer_info value_ar_buf = this->value_ar.request(); - //this->value_ar_ptr = static_cast(value_ar_buf.ptr); + py::capsule free_value_ar(this->value_ar_ptr, [](void* f){ + double *value_ar_ptr = reinterpret_cast(f); + delete[] value_ar_ptr; + }); + + py::capsule free_node_ar(this->node_ar_ptr, [](void* f){ + skl_tree_node *node_ar_ptr = reinterpret_cast(f); + delete[] node_ar_ptr; + }); + + this->node_ar = py::array_t(node_ar_shape, node_ar_strides, this->node_ar_ptr, free_node_ar); + this->value_ar = py::array_t(value_ar_shape, value_ar_strides, this->value_ar_ptr, free_value_ar); + } template From 7d4b141434d46656685136359c4755c38a52d80f Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:00:13 -0700 Subject: [PATCH 38/52] moving to lazy evaluation --- onedal/primitives/__init__.py | 7 -- onedal/primitives/get_tree.py | 10 --- onedal/primitives/tree_visitor.cpp | 54 -------------- sklearnex/preview/ensemble/extra_trees.py | 88 ++++++++++------------- sklearnex/preview/ensemble/forest.py | 88 ++++++++++------------- 5 files changed, 76 insertions(+), 171 deletions(-) diff --git a/onedal/primitives/__init__.py b/onedal/primitives/__init__.py index 544a8ad1e7..a409999030 100644 --- a/onedal/primitives/__init__.py +++ b/onedal/primitives/__init__.py @@ -14,8 +14,6 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import daal_check_version - from .get_tree import get_tree_state_cls, get_tree_state_reg from .kernel_functions import linear_kernel, poly_kernel, rbf_kernel, sigmoid_kernel @@ -27,8 +25,3 @@ "poly_kernel", "sigmoid_kernel", ] - -if daal_check_version((2023, "P", 301)): - from .get_tree import get_forest_state - - __all__ += ["get_forest_state"] diff --git a/onedal/primitives/get_tree.py b/onedal/primitives/get_tree.py index 2d330f5093..9afd86624b 100644 --- a/onedal/primitives/get_tree.py +++ b/onedal/primitives/get_tree.py @@ -14,7 +14,6 @@ # limitations under the License. # =============================================================================== -from daal4py.sklearn._utils import daal_check_version from onedal import _backend @@ -24,12 +23,3 @@ def get_tree_state_cls(model, iTree, n_classes): def get_tree_state_reg(model, iTree): return _backend.get_tree.regression.get_tree_state(model, iTree, 1) - - -if daal_check_version((2023, "P", 301)): - - def get_forest_state(model, n_classes=None): - if n_classes: - return _backend.get_tree.classification.get_all_states(model, n_classes) - else: - return _backend.get_tree.regression.get_all_states(model, 1) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index c987ef0d1e..75932bc13c 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -204,7 +204,6 @@ to_sklearn_tree_object_visitor::to_sklearn_tree_object_visitor(std::size_t this->node_ar = py::array_t(node_ar_shape, node_ar_strides, this->node_ar_ptr, free_node_ar); this->value_ar = py::array_t(value_ar_shape, value_ar_strides, this->value_ar_ptr, free_value_ar); - } template @@ -289,55 +288,6 @@ bool to_sklearn_tree_object_visitor::call( return true; } -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 -template -py::list get_all_states(const decision_forest::model& model, std::size_t n_classes) { - using ncv_dec = node_visitor>; - using tsv_dec = node_visitor>; - - std::size_t tree_count = model.get_tree_count(); - std::vector> ncvs(tree_count, node_count_visitor()); - std::vector ncv_decorators; - for(std::size_t i=0; i < tree_count; i++){ - ncv_decorators.push_back(ncv_dec{&ncvs[i]}); - } - - model.template traverse_depth_first, ncv_dec> (std::move(ncv_decorators)); - - // generate memory block here - py::list output; - - // this may be slow based on the memory allocation - std::vector> tsvs; - std::vector tsv_decorators; - for(std::size_t i=0; i < tree_count; i++){ - tsvs.push_back(to_sklearn_tree_object_visitor(ncvs[i].depth, - ncvs[i].n_nodes, - ncvs[i].n_leaf_nodes, - n_classes)); - } - // must be done separately due to the nature of the decorators and a constant pointer vs vector push back - for(std::size_t i=0; i < tree_count; i++){ - tsv_decorators.push_back(tsv_dec{&tsvs[i]}); - } - - model.template traverse_depth_first, tsv_dec>(std::move(tsv_decorators)); - - // create list here - for( std::size_t i=0; i < tree_count; i++){ - py::dict est_tree_state; - est_tree_state["max_depth"] = tsvs[i].max_depth; - est_tree_state["node_count"] = tsvs[i].node_count; - est_tree_state["nodes"] = tsvs[i].node_ar; - est_tree_state["values"] = tsvs[i].value_ar; - output.append(est_tree_state); - } - - return output; -} -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 - - template void init_get_tree_state(py::module_& m) { using namespace decision_forest; @@ -368,10 +318,6 @@ void init_get_tree_state(py::module_& m) { .def_readwrite("node_count", &tree_state_t::node_count) .def_readwrite("leaf_count", &tree_state_t::leaf_count) .def_readwrite("class_count", &tree_state_t::class_count); - -#if defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 - m.def("get_all_states", &get_all_states, py::return_value_policy::take_ownership); -#endif // defined(ONEDAL_VERSION) && ONEDAL_VERSION >= 20230301 } ONEDAL_PY_TYPE2STR(decision_forest::task::classification, "classification"); diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index 41f37127bd..c7fe867e68 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -53,9 +53,6 @@ if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval -if daal_check_version((2023, "P", 301)): - from onedal.primitives import get_forest_state - class BaseTree(ABC): def _fit_proba(self, X, y, sample_weight=None, queue=None): @@ -194,6 +191,20 @@ def check_sample_weight(self, sample_weight, X, dtype=None): ) return sample_weight + @property + def estimators_(self): + if hasattr(self, '_cached_estimators_'): + if self._cached_estimators_ is None and self._onedal_model: + self._estimators_() + return self._cached_estimators_ + else: + raise AttributeError(f"'{self.__class__.__name__}' has no attribute 'estimators_'") + + @estimators_.setter + def estimators_(self, estimators): + # Needed to allow for proper sklearn operation in fallback mode + self._cached_estimators_ = estimators + class ExtraTreesClassifier(sklearn_ExtraTreesClassifier, BaseTree): __doc__ = sklearn_ExtraTreesClassifier.__doc__ @@ -544,11 +555,7 @@ def predict_proba(self, X): def n_features_(self): return self.n_features_in_ - @property def _estimators_(self): - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -575,8 +582,6 @@ def _estimators_(self): estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, "P", 301)): - allstates = get_forest_state(self._onedal_model, n_classes_) for i in range(self.n_estimators): est_i = clone(est) @@ -590,17 +595,13 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - if daal_check_version((2023, "P", 301)): - tree_i_state_dict = allstates[i] - tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) - else: - tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) - tree_i_state_dict = { - "max_depth": tree_i_state_class.max_depth, - "node_count": tree_i_state_class.node_count, - "nodes": check_tree_nodes(tree_i_state_class.node_ar), - "values": tree_i_state_class.value_ar, - } + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) + tree_i_state_dict = { + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, @@ -611,7 +612,6 @@ def _estimators_(self): estimators_.append(est_i) self._cached_estimators_ = estimators_ - return estimators_ def _onedal_cpu_supported(self, method_name, *data): class_name = self.__class__.__name__ @@ -635,8 +635,6 @@ def _onedal_cpu_supported(self, method_name, *data): ] ) - dal_ready = dal_ready and not hasattr(self, "estimators_") - if dal_ready and (self.random_state is not None): warnings.warn( "Setting 'random_state' value is not supported. " @@ -697,8 +695,6 @@ def _onedal_gpu_supported(self, method_name, *data): ] ) - dal_ready &= not hasattr(self, "estimators_") - if dal_ready and (self.random_state is not None): warnings.warn( "Setting 'random_state' value is not supported. " @@ -831,6 +827,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params["min_impurity_split"] = self.min_impurity_split else: onedal_params["min_impurity_split"] = None + + # Lazy evaluation of estimators_ self._cached_estimators_ = None # Compute @@ -840,7 +838,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): self._save_attributes() if sklearn_check_version("1.2"): self._estimator = ExtraTreeClassifier() - self.estimators_ = self._estimators_ + # Decapsulate classes_ attributes self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] @@ -975,11 +973,7 @@ def __init__( self.max_bins = max_bins self.min_bin_size = min_bin_size - @property def _estimators_(self): - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -1003,8 +997,6 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, "P", 301)): - allstates = get_forest_state(self._onedal_model) for i in range(self.n_estimators): est_i = clone(est) @@ -1017,25 +1009,22 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - if daal_check_version((2023, "P", 301)): - tree_i_state_dict = allstates[i] - tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) - else: - tree_i_state_class = get_tree_state_reg(self._onedal_model, i) - tree_i_state_dict = { - "max_depth": tree_i_state_class.max_depth, - "node_count": tree_i_state_class.node_count, - "nodes": check_tree_nodes(tree_i_state_class.node_ar), - "values": tree_i_state_class.value_ar, - } + + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) + tree_i_state_dict = { + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) - - return estimators_ + + self._cached_estimators_ = estimators_ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): if sp.issparse(y): @@ -1173,8 +1162,6 @@ def _onedal_cpu_supported(self, method_name, *data): ] ) - dal_ready &= not hasattr(self, "estimators_") - if dal_ready and (self.random_state is not None): warnings.warn( "Setting 'random_state' value is not supported. " @@ -1235,8 +1222,6 @@ def _onedal_gpu_supported(self, method_name, *data): ] ) - dal_ready &= not hasattr(self, "estimators_") - if dal_ready and (self.random_state is not None): warnings.warn( "Setting 'random_state' value is not supported. " @@ -1331,14 +1316,17 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): } if daal_check_version((2023, "P", 101)): onedal_params["splitter_mode"] = "random" + + # Lazy evaluation of estimators_ self._cached_estimators_ = None + self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) self._save_attributes() if sklearn_check_version("1.2"): self._estimator = ExtraTreeRegressor() - self.estimators_ = self._estimators_ + return self def _onedal_predict(self, X, queue=None): diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 79440ed87e..60006ba64d 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -52,9 +52,6 @@ if sklearn_check_version("1.2"): from sklearn.utils._param_validation import Interval, StrOptions -if daal_check_version((2023, "P", 301)): - from onedal.primitives import get_forest_state - class BaseRandomForest(ABC): def _fit_proba(self, X, y, sample_weight=None, queue=None): @@ -195,6 +192,20 @@ def check_sample_weight(self, sample_weight, X, dtype=None): ) return sample_weight + @property + def estimators_(self): + if hasattr(self, '_cached_estimators_'): + if self._cached_estimators_ is None and self._onedal_model: + self._estimators_() + return self._cached_estimators_ + else: + raise AttributeError(f"'{self.__class__.__name__}' has no attribute 'estimators_'") + + @estimators_.setter + def estimators_(self, estimators): + # Needed to allow for proper sklearn operation in fallback mode + self._cached_estimators_ = estimators + class RandomForestClassifier(sklearn_RandomForestClassifier, BaseRandomForest): __doc__ = sklearn_RandomForestClassifier.__doc__ @@ -509,11 +520,7 @@ def predict_proba(self, X): def n_features_(self): return self.n_features_in_ - @property def _estimators_(self): - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -539,8 +546,6 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, "P", 301)): - allstates = get_forest_state(self._onedal_model, n_classes_) for i in range(self.n_estimators): est_i = clone(est) @@ -554,17 +559,13 @@ def _estimators_(self): est_i.n_outputs_ = self.n_outputs_ est_i.classes_ = classes_ est_i.n_classes_ = n_classes_ - if daal_check_version((2023, "P", 301)): - tree_i_state_dict = allstates[i] - tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) - else: - tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) - tree_i_state_dict = { - "max_depth": tree_i_state_class.max_depth, - "node_count": tree_i_state_class.node_count, - "nodes": check_tree_nodes(tree_i_state_class.node_ar), - "values": tree_i_state_class.value_ar, - } + tree_i_state_class = get_tree_state_cls(self._onedal_model, i, n_classes_) + tree_i_state_dict = { + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, @@ -575,7 +576,6 @@ def _estimators_(self): estimators_.append(est_i) self._cached_estimators_ = estimators_ - return estimators_ def _onedal_cpu_supported(self, method_name, *data): if method_name == "fit": @@ -604,8 +604,6 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, "estimators_"): - return False else: return True if method_name in ["predict", "predict_proba"]: @@ -654,8 +652,6 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, "estimators_"): - return False else: return True if method_name in ["predict", "predict_proba"]: @@ -758,6 +754,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): } if daal_check_version((2023, "P", 101)): onedal_params["splitter_mode"] = self.splitter_mode + + # Lazy evaluation of estimators_ self._cached_estimators_ = None # Compute @@ -767,7 +765,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): self._save_attributes() if sklearn_check_version("1.2"): self._estimator = DecisionTreeClassifier() - self.estimators_ = self._estimators_ + # Decapsulate classes_ attributes self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] @@ -909,11 +907,7 @@ def __init__( self.min_impurity_split = None self.splitter_mode = splitter_mode - @property def _estimators_(self): - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -937,8 +931,6 @@ def _estimators_(self): # oneAPI Data Analytics Library solution estimators_ = [] random_state_checked = check_random_state(self.random_state) - if daal_check_version((2023, "P", 301)): - allstates = get_forest_state(self._onedal_model) for i in range(self.n_estimators): est_i = clone(est) @@ -951,25 +943,21 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ - if daal_check_version((2023, "P", 301)): - tree_i_state_dict = allstates[i] - tree_i_state_dict["nodes"] = check_tree_nodes(tree_i_state_dict["nodes"]) - else: - tree_i_state_class = get_tree_state_reg(self._onedal_model, i) - tree_i_state_dict = { - "max_depth": tree_i_state_class.max_depth, - "node_count": tree_i_state_class.node_count, - "nodes": check_tree_nodes(tree_i_state_class.node_ar), - "values": tree_i_state_class.value_ar, - } + + tree_i_state_class = get_tree_state_reg(self._onedal_model, i) + tree_i_state_dict = { + "max_depth": tree_i_state_class.max_depth, + "node_count": tree_i_state_class.node_count, + "nodes": check_tree_nodes(tree_i_state_class.node_ar), + "values": tree_i_state_class.value_ar, + } est_i.tree_ = Tree( self.n_features_in_, np.array([1], dtype=np.intp), self.n_outputs_ ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) - - return estimators_ + self._cached_estimators = estimators_ def _onedal_ready(self, X, y, sample_weight): # TODO: @@ -1022,8 +1010,6 @@ def _onedal_cpu_supported(self, method_name, *data): return False elif not self.n_outputs_ == 1: return False - elif hasattr(self, "estimators_"): - return False else: return True if method_name == "predict": @@ -1075,8 +1061,6 @@ def _onedal_gpu_supported(self, method_name, *data): return False elif self.oob_score: return False - elif hasattr(self, "estimators_"): - return False else: return True if method_name == "predict": @@ -1142,14 +1126,18 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): } if daal_check_version((2023, "P", 101)): onedal_params["splitter_mode"] = self.splitter_mode + + # Lazy evaluation of estimators_ self._cached_estimators_ = None + + # Compute self._onedal_estimator = self._onedal_regressor(**onedal_params) self._onedal_estimator.fit(X, y, sample_weight, queue=queue) self._save_attributes() if sklearn_check_version("1.2"): self._estimator = DecisionTreeRegressor() - self.estimators_ = self._estimators_ + return self def _onedal_predict(self, X, queue=None): From 73615569933bd7bd7e2f003ae0b88060313dd060 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:20:59 -0700 Subject: [PATCH 39/52] incomplete merge --- sklearnex/preview/ensemble/extra_trees.py | 30 ----------------------- 1 file changed, 30 deletions(-) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index d568f33ccc..2055d4e483 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -556,12 +556,6 @@ def n_features_(self): return self.n_features_in_ def _estimators_(self): -<<<<<<< HEAD -======= - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ ->>>>>>> master if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -608,10 +602,6 @@ def _estimators_(self): "nodes": check_tree_nodes(tree_i_state_class.node_ar), "values": tree_i_state_class.value_ar, } -<<<<<<< HEAD - -======= ->>>>>>> master est_i.tree_ = Tree( self.n_features_in_, np.array([n_classes_], dtype=np.intp), @@ -726,11 +716,7 @@ def _onedal_gpu_supported(self, method_name, *data): (self.warm_start is False, "Warm start is not supported."), ( daal_check_version((2023, "P", 100)), -<<<<<<< HEAD "ExtraTrees supported starting from oneDAL version 2023.1", -======= - "ExtraTrees only supported starting from oneDAL version 2023.1", ->>>>>>> master ), ] ) @@ -840,11 +826,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): onedal_params["min_impurity_split"] = self.min_impurity_split else: onedal_params["min_impurity_split"] = None -<<<<<<< HEAD # Lazy evaluation of estimators_ -======= ->>>>>>> master self._cached_estimators_ = None # Compute @@ -990,12 +973,6 @@ def __init__( self.min_bin_size = min_bin_size def _estimators_(self): -<<<<<<< HEAD -======= - if hasattr(self, "_cached_estimators_"): - if self._cached_estimators_: - return self._cached_estimators_ ->>>>>>> master if sklearn_check_version("0.22"): check_is_fitted(self) else: @@ -1031,10 +1008,6 @@ def _estimators_(self): est_i.n_features_ = self.n_features_in_ est_i.n_classes_ = 1 est_i.n_outputs_ = self.n_outputs_ -<<<<<<< HEAD - -======= ->>>>>>> master tree_i_state_class = get_tree_state_reg(self._onedal_model, i) tree_i_state_dict = { "max_depth": tree_i_state_class.max_depth, @@ -1341,11 +1314,8 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): } if daal_check_version((2023, "P", 101)): onedal_params["splitter_mode"] = "random" -<<<<<<< HEAD # Lazy evaluation of estimators_ -======= ->>>>>>> master self._cached_estimators_ = None self._onedal_estimator = self._onedal_regressor(**onedal_params) From 009b1d7ec495003fa4f3b7aa6b0fed25fccda838 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:25:22 -0700 Subject: [PATCH 40/52] remove vestigial code --- onedal/primitives/tree_visitor.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 75932bc13c..081c7ca027 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -18,7 +18,6 @@ #include "onedal/common.hpp" #include "oneapi/dal/algo/decision_forest.hpp" #include "numpy/arrayobject.h" -#include "onedal/version.hpp" #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #define ONEDAL_PY_TERMINAL_NODE -1 #define ONEDAL_PY_NO_FEATURE -2 @@ -102,14 +100,6 @@ class node_count_visitor { return true; } - /*node_count_visitor(node_count_visitor&&) = default; - bool operator()(const df::leaf_node_info& info) { - return call(info); - } - bool operator()(const df::split_node_info& info) { - return call(info); - }*/ - std::size_t n_nodes; std::size_t depth; @@ -147,14 +137,6 @@ class to_sklearn_tree_object_visitor : public tree_state { double* value_ar_ptr; skl_tree_node* node_ar_ptr; - /*to_sklearn_tree_object_visitor(to_sklearn_tree_object_visitor&&) = default; - bool operator()(const df::leaf_node_info& info) { - return call(info); - } - bool operator()(const df::split_node_info& info) { - return call(info); - }*/ - protected: std::size_t node_id; std::size_t max_n_classes; From 1c02e385948a6d832929f311fabacfc2b2cd60ff Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:28:16 -0700 Subject: [PATCH 41/52] back in black --- sklearnex/preview/ensemble/extra_trees.py | 8 +++++--- sklearnex/preview/ensemble/forest.py | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index 2055d4e483..cc0d14484e 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -193,12 +193,14 @@ def check_sample_weight(self, sample_weight, X, dtype=None): @property def estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_ is None and self._onedal_model: self._estimators_() return self._cached_estimators_ else: - raise AttributeError(f"'{self.__class__.__name__}' has no attribute 'estimators_'") + raise AttributeError( + f"'{self.__class__.__name__}' has no attribute 'estimators_'" + ) @estimators_.setter def estimators_(self, estimators): @@ -1021,7 +1023,7 @@ def _estimators_(self): ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) - + self._cached_estimators_ = estimators_ def _onedal_fit_ready(self, patching_status, X, y, sample_weight): diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 9ee47f8e2a..881f9c7530 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -194,12 +194,14 @@ def check_sample_weight(self, sample_weight, X, dtype=None): @property def estimators_(self): - if hasattr(self, '_cached_estimators_'): + if hasattr(self, "_cached_estimators_"): if self._cached_estimators_ is None and self._onedal_model: self._estimators_() return self._cached_estimators_ else: - raise AttributeError(f"'{self.__class__.__name__}' has no attribute 'estimators_'") + raise AttributeError( + f"'{self.__class__.__name__}' has no attribute 'estimators_'" + ) @estimators_.setter def estimators_(self, estimators): From 61e891dc24823dda994cca2abee70aedd9b13b37 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:30:53 -0700 Subject: [PATCH 42/52] add const-ness --- onedal/primitives/tree_visitor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 081c7ca027..a09f3a08ac 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -252,9 +252,9 @@ template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { - std::size_t depth = static_cast(info.get_level()); - std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction - double nNodeSampleCount = info.get_sample_count(); // do them only once + const std::size_t depth = static_cast(info.get_level()); + const std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction + const double nNodeSampleCount = info.get_sample_count(); // do them only once while(depth--) { From 34f536d5016cb3b7e20ec3989caee1bb22d45d83 Mon Sep 17 00:00:00 2001 From: icfaust Date: Mon, 31 Jul 2023 06:31:45 -0700 Subject: [PATCH 43/52] remove const from depth --- onedal/primitives/tree_visitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index a09f3a08ac..5495fb6612 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -252,7 +252,7 @@ template <> bool to_sklearn_tree_object_visitor::call( const df::leaf_node_info& info) { - const std::size_t depth = static_cast(info.get_level()); + std::size_t depth = static_cast(info.get_level()); const std::size_t label = info.get_response(); // these may be a slow accesses due to oneDAL abstraction const double nNodeSampleCount = info.get_sample_count(); // do them only once From 7cd70c24feac7787d70d43f2550e1e29abf75b15 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 2 Aug 2023 13:16:07 +0200 Subject: [PATCH 44/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: Alexander Andreev --- onedal/primitives/tree_visitor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index 5495fb6612..c02f982a65 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -82,7 +82,6 @@ struct tree_state { std::size_t leaf_count; std::size_t class_count; }; - // Declaration and implementation. template class node_count_visitor { From d411d36f869f0ef13636cdd547e1ede053fba363 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 2 Aug 2023 13:16:17 +0200 Subject: [PATCH 45/52] Update onedal/primitives/tree_visitor.cpp Co-authored-by: Alexander Andreev --- onedal/primitives/tree_visitor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp index c02f982a65..3ced21967e 100644 --- a/onedal/primitives/tree_visitor.cpp +++ b/onedal/primitives/tree_visitor.cpp @@ -273,8 +273,8 @@ template void init_get_tree_state(py::module_& m) { using namespace decision_forest; using model_t = model; - using tree_state_t = tree_state; - + using tree_state_t = tree_state; + // TODO: // create one instance for cls and reg. py::class_(m, "get_tree_state") From 9d26f0a3ec567e198695dd992f16f6381c36ede8 Mon Sep 17 00:00:00 2001 From: Ian Faust Date: Wed, 2 Aug 2023 15:23:06 +0200 Subject: [PATCH 46/52] Update onedal/ensemble/forest.py Co-authored-by: Alexander Andreev --- onedal/ensemble/forest.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py index 5632269f07..de33a27597 100644 --- a/onedal/ensemble/forest.py +++ b/onedal/ensemble/forest.py @@ -364,16 +364,13 @@ def _fit(self, X, y, sample_weight, module, queue): if sample_weight is not None and len(sample_weight) > 0: sample_weight = self._get_sample_weight(sample_weight, X) - policy = self._get_policy(queue, X, y, sample_weight) - X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight) - params = self._get_onedal_params(X) - train_result = module.train(policy, params, *to_table(X, y, sample_weight)) - + data = (X, y, sample_weight) else: - policy = self._get_policy(queue, X, y) - X, y = _convert_to_supported(policy, X, y) - params = self._get_onedal_params(X) - train_result = module.train(policy, params, *to_table(X, y)) + data = (X, y) + policy = self._get_policy(queue, *data) + data = _convert_to_supported(policy, *data) + params = self._get_onedal_params(data[0]) + train_result = module.train(policy, params, *to_table(*data)) self._onedal_model = train_result.model From 9dd485b98c2cc283ebddcf257d1100082a87d023 Mon Sep 17 00:00:00 2001 From: icfaust Date: Wed, 2 Aug 2023 11:00:25 -0700 Subject: [PATCH 47/52] remove unneccesary checks breaking lazy evaluation --- sklearnex/preview/ensemble/extra_trees.py | 20 +++++++++----------- sklearnex/preview/ensemble/forest.py | 18 +++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index cc0d14484e..23f95cc663 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -558,10 +558,8 @@ def n_features_(self): return self.n_features_in_ def _estimators_(self): - if sklearn_check_version("0.22"): - check_is_fitted(self) - else: - check_is_fitted(self, "_onedal_model") + # _estimators_ should only be called if _onedal_model exists + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators @@ -847,7 +845,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) - check_is_fitted(self) + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -856,7 +854,7 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) - check_is_fitted(self) + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): @@ -975,10 +973,8 @@ def __init__( self.min_bin_size = min_bin_size def _estimators_(self): - if sklearn_check_version("0.22"): - check_is_fitted(self) - else: - check_is_fitted(self, "_onedal_model") + # _estimators_ should only be called if _onedal_model exists + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { "criterion": self.criterion, @@ -1330,9 +1326,11 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): return self def _onedal_predict(self, X, queue=None): + X = check_array(X, dtype=[np.float32, np.float64]) + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = self._validate_X_predict(X) + return self._onedal_estimator.predict(X, queue=queue) def fit(self, X, y, sample_weight=None): diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 881f9c7530..517682617a 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -523,10 +523,7 @@ def n_features_(self): return self.n_features_in_ def _estimators_(self): - if sklearn_check_version("0.22"): - check_is_fitted(self) - else: - check_is_fitted(self, "_onedal_model") + check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] n_classes_ = self.n_classes_[0] # convert model to estimators @@ -774,7 +771,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) - check_is_fitted(self) + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -783,7 +780,7 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) - check_is_fitted(self) + if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) if sklearn_check_version("1.0"): @@ -909,10 +906,7 @@ def __init__( self.splitter_mode = splitter_mode def _estimators_(self): - if sklearn_check_version("0.22"): - check_is_fitted(self) - else: - check_is_fitted(self, "_onedal_model") + check_is_fitted(self, "_onedal_model") # convert model to estimators params = { "criterion": self.criterion, @@ -1141,9 +1135,11 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): return self def _onedal_predict(self, X, queue=None): + X = check_array(X, dtype=[np.float32, np.float64]) + if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) - X = self._validate_X_predict(X) + return self._onedal_estimator.predict(X, queue=queue) def fit(self, X, y, sample_weight=None): From 11feca2f3f6cd8285c3fd79e812c11be2d76f3d8 Mon Sep 17 00:00:00 2001 From: icfaust Date: Wed, 2 Aug 2023 11:10:30 -0700 Subject: [PATCH 48/52] readded check_if_fitted --- sklearnex/preview/ensemble/extra_trees.py | 3 +++ sklearnex/preview/ensemble/forest.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index 23f95cc663..a7d1e5d76e 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -845,6 +845,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -854,6 +855,7 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) @@ -1327,6 +1329,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 517682617a..66b6df72c6 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -771,6 +771,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) @@ -780,6 +781,7 @@ def _onedal_predict(self, X, queue=None): def _onedal_predict_proba(self, X, queue=None): X = check_array(X, dtype=[np.float64, np.float32]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("0.23"): self._check_n_features(X, reset=False) @@ -1136,6 +1138,7 @@ def _onedal_fit(self, X, y, sample_weight=None, queue=None): def _onedal_predict(self, X, queue=None): X = check_array(X, dtype=[np.float32, np.float64]) + check_is_fitted(self, "_onedal_model") if sklearn_check_version("1.0"): self._check_feature_names(X, reset=False) From 1a1ec39e94e2765db2d3eb130402d46bf4224817 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 28 Aug 2023 04:29:31 -0700 Subject: [PATCH 49/52] ET/RF fixes for estimators caching --- sklearnex/preview/ensemble/extra_trees.py | 4 +++- sklearnex/preview/ensemble/forest.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearnex/preview/ensemble/extra_trees.py b/sklearnex/preview/ensemble/extra_trees.py index 36d63e8c55..6645f9f1f9 100644 --- a/sklearnex/preview/ensemble/extra_trees.py +++ b/sklearnex/preview/ensemble/extra_trees.py @@ -561,7 +561,9 @@ def _estimators_(self): # _estimators_ should only be called if _onedal_model exists check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] - n_classes_ = self.n_classes_[0] + n_classes_ = ( + self.n_classes_ if isinstance(self.n_classes_, int) else self.n_classes_[0] + ) # convert model to estimators params = { "criterion": self.criterion, diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 7e4aa5bc10..028345aaf4 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -1078,7 +1078,7 @@ def _estimators_(self): ) est_i.tree_.__setstate__(tree_i_state_dict) estimators_.append(est_i) - self._cached_estimators = estimators_ + self._cached_estimators_ = estimators_ def _onedal_ready(self, X, y, sample_weight): # TODO: From 87b49e2833f7e07ac0b9b95772a65177677bea8b Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 28 Aug 2023 11:35:20 -0700 Subject: [PATCH 50/52] Fix monotonicity check --- sklearnex/preview/ensemble/forest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 4aeab1e9c0..78735536d6 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -446,9 +446,7 @@ def _onedal_ready(self, X, y, sample_weight): correct_ccp_alpha = self.ccp_alpha == 0.0 correct_criterion = self.criterion == "gini" correct_warm_start = self.warm_start is False - correct_monotonic_cst = ( - sklearn_check_version("1.4") and self.monotonic_cst is None - ) + correct_monotonic_cst = getattr(self, "monotonic_cst", None) is None if correct_sparsity and sklearn_check_version("1.4"): try: _assert_all_finite(X) From e0275e56f72e9b40711aa8b10afc03360acc180d Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 28 Aug 2023 11:59:56 -0700 Subject: [PATCH 51/52] ET/RF fixes for estimators caching --- sklearnex/preview/ensemble/forest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index 78735536d6..fc1f3ae9d0 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -593,7 +593,9 @@ def n_features_(self): def _estimators_(self): check_is_fitted(self, "_onedal_model") classes_ = self.classes_[0] - n_classes_ = self.n_classes_[0] + n_classes_ = ( + self.n_classes_ if isinstance(self.n_classes_, int) else self.n_classes_[0] + ) # convert model to estimators params = { "criterion": self.criterion, From 050af653677985e4dbb84a761aaca160334b8473 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 28 Aug 2023 12:10:10 -0700 Subject: [PATCH 52/52] Fix RF parameter check --- sklearnex/preview/ensemble/forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearnex/preview/ensemble/forest.py b/sklearnex/preview/ensemble/forest.py index fc1f3ae9d0..64f4558108 100755 --- a/sklearnex/preview/ensemble/forest.py +++ b/sklearnex/preview/ensemble/forest.py @@ -470,6 +470,7 @@ def _onedal_ready(self, X, y, sample_weight): correct_warm_start, correct_monotonic_cst, correct_finiteness, + self.class_weight != "balanced_subsample", ] ) if ready: