intel · Alexsandruss · Aug 28, 2023 · Jun 14, 2023 · Jun 15, 2023 · Jun 15, 2023
diff --git a/onedal/ensemble/forest.cpp b/onedal/ensemble/forest.cpp
@@ -216,6 +216,17 @@ void init_train_ops(py::module_& m) {
               train_ops ops(policy, input_t{ data, responses, weights }, params2desc{});
               return fptype2t{ method2t{ Task{}, ops } }(params);
           });
+    m.def("train", // second "train" binding: like the one registered just above, but takes no weights table
+          [](const Policy& policy,
+             const py::dict& params,
+             const table& data,
+             const table& responses) {
+              using namespace decision_forest;
+              using input_t = train_input<Task>;
+
+              train_ops ops(policy, input_t{ data, responses}, params2desc{}); // train input built from data and responses only
+              return fptype2t{ method2t{ Task{}, ops } }(params); // same dispatch expression as the weighted binding above
+          });
 }
 
 template <typename Policy, typename Task>

diff --git a/onedal/ensemble/forest.py b/onedal/ensemble/forest.py
@@ -305,40 +305,21 @@ def _validate_targets(self, y, dtype):
         self.classes_ = None
         return _column_or_1d(y, warn=True).astype(dtype, copy=False)
 
-    def _get_sample_weight(self, X, y, sample_weight):
-        n_samples = X.shape[0]
-        dtype = X.dtype
-        if n_samples == 1:
-            raise ValueError("n_samples=1")
-
-        sample_weight = np.asarray([]
-                                   if sample_weight is None
-                                   else sample_weight, dtype=dtype)
-        sample_weight = sample_weight.ravel()
-
-        sample_weight_count = sample_weight.shape[0]
-        if sample_weight_count != 0 and sample_weight_count != n_samples:
+    def _get_sample_weight(self, sample_weight, X):  # returns sample_weight flattened, cast to X.dtype and passed through _check_array
+        sample_weight = np.asarray(sample_weight, dtype=X.dtype).ravel()  # flatten to 1-D in X's dtype
+
+        if sample_weight.size != X.shape[0]:  # require exactly one weight per row of X
             raise ValueError("sample_weight and X have incompatible shapes: "
                              "%r vs %r\n"
                              "Note: Sparse matrices cannot be indexed w/"
                              "boolean masks (use `indices=True` in CV)."
-                             % (len(sample_weight), X.shape))
+                             % (sample_weight.shape, X.shape))  # reports the flattened weights' shape tuple next to X.shape
 
-        if sample_weight_count == 0:
-            sample_weight = np.ones(n_samples, dtype=dtype)
-        elif isinstance(sample_weight, Number):
-            sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
-        else:
-            sample_weight = _check_array(
-                sample_weight, accept_sparse=False, ensure_2d=False,
-                dtype=dtype, order="C"
-            )
-            if sample_weight.ndim != 1:
-                raise ValueError("Sample weights must be 1D array or scalar")
+        sample_weight = _check_array(  # _check_array is defined elsewhere; presumably validates the array -- TODO confirm
+            sample_weight, accept_sparse=False, ensure_2d=False,
+            dtype=X.dtype, order="C"
+        )
 
-            if sample_weight.shape != (n_samples,):
-                raise ValueError("sample_weight.shape == {}, expected {}!"
-                                 .format(sample_weight.shape, (n_samples,)))
         return sample_weight
 
     def _get_policy(self, queue, *data):
@@ -349,17 +330,24 @@ def _fit(self, X, y, sample_weight, module, queue):
             X, y, dtype=[np.float64, np.float32],
             force_all_finite=True, accept_sparse='csr')
         y = self._validate_targets(y, X.dtype)
-        sample_weight = self._get_sample_weight(X, y, sample_weight)
 
         self.n_features_in_ = X.shape[1]
         if not sklearn_check_version('1.0'):
             self.n_features_ = self.n_features_in_
-        policy = self._get_policy(queue, X, y, sample_weight)
 
-        X, y, sample_weight = _convert_to_supported(policy, X, y, sample_weight)
+        data = [X, y]
+
+        if sample_weight is not None and len(sample_weight) > 0:
+            sample_weight = self._get_sample_weight(sample_weight, X)
+            data.append(sample_weight)
+
+        policy = self._get_policy(queue, *data)
+
+        #pass as *data
+        data = _convert_to_supported(policy, *data)
         params = self._get_onedal_params(X)
         train_result = module.train(
-            policy, params, *to_table(X, y, sample_weight))
+            policy, params, *to_table(*data))
         self._onedal_model = train_result.model
 
         if self.oob_score:
@@ -533,7 +521,7 @@ def __init__(self,
                  voting_mode='weighted',
                  error_metric_mode='none',
                  variable_importance_mode='none',
-                 algorithm='dense',
+                 algorithm='hist',
                  **kwargs):
         super().__init__(
             n_estimators=n_estimators,

diff --git a/onedal/primitives/tree_visitor.cpp b/onedal/primitives/tree_visitor.cpp
@@ -75,14 +75,45 @@ class skl_tree_node {
 // We only expose the minimum information to python
 template <typename T>
 struct tree_state {
+    skl_tree_node * node_ar; // raw node buffer; presumably the new[] allocation made in the visitor constructor -- TODO confirm
+    double * value_ar; // raw value buffer; tree_state_py exposes it with shape (node_count, 1, class_count)
+    std::size_t max_depth;
+    std::size_t node_count; // length of node_ar as used by tree_state_py
+    std::size_t leaf_count;
+    std::size_t class_count; // innermost extent of value_ar as used by tree_state_py
+};
+
+template <typename T>
+class tree_state_py { // Python-facing counterpart of tree_state<T>: same counters, buffers held as py::array_t
+public:
     py::array_t<skl_tree_node> node_ar;
     py::array_t<double> value_ar;
     std::size_t max_depth;
     std::size_t node_count;
     std::size_t leaf_count;
     std::size_t class_count;
+
+    tree_state_py(tree_state<T> inp){ // converting constructor: copies the counters, then builds both arrays from inp's raw pointers
+        this->max_depth = inp.max_depth;
+        this->node_count = inp.node_count;        
+        this->leaf_count = inp.leaf_count;
+        this->class_count = inp.class_count;
+
+        auto node_ar_shape = py::array::ShapeContainer({ this->node_count }); // 1-D: one skl_tree_node per node
+        auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) });
+
+        auto value_ar_shape = py::array::ShapeContainer({ static_cast<Py_ssize_t>(this->node_count), // 3-D: (node_count, 1, class_count)
+                                                      1,
+                                                      static_cast<Py_ssize_t>(this->class_count) });
+        auto value_ar_strides = py::array::StridesContainer( // byte strides; the unit middle axis reuses the per-node stride
+        { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) });
+
+        this->node_ar = py::array_t<skl_tree_node>(node_ar_shape, node_ar_strides, inp.node_ar); // NOTE(review): no base handle is passed; pybind11 is documented to copy the data in that case -- confirm who frees inp.node_ar
+        this->value_ar = py::array_t<double>(value_ar_shape, value_ar_strides, inp.value_ar); // NOTE(review): same ownership question for inp.value_ar
+    }
 };
 
+
 // Declaration and implementation.
 template <typename Task>
 class node_count_visitor {
@@ -153,52 +184,31 @@ to_sklearn_tree_object_visitor<Task>::to_sklearn_tree_object_visitor(std::size_t
     this->max_depth = _depth;
     this->leaf_count = _n_leafs;
     this->class_count = _max_n_classes;
-
-    auto node_ar_shape = py::array::ShapeContainer({ this->node_count });
-    auto node_ar_strides = py::array::StridesContainer({ sizeof(skl_tree_node) });
-
-    auto value_ar_shape = py::array::ShapeContainer({ static_cast<Py_ssize_t>(this->node_count),
-                                                      1,
-                                                      static_cast<Py_ssize_t>(this->class_count) });
-    auto value_ar_strides = py::array::StridesContainer(
-        { this->class_count * sizeof(double), this->class_count * sizeof(double), sizeof(double) });
-
-    skl_tree_node* node_ar_ptr = new skl_tree_node[this->node_count];
-
-
     OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, this->node_count, this->class_count);
-    double* value_ar_ptr =
-        new double[this->node_count * 1 *
-                   this->class_count](); // oneDAL only supports scalar responses for now
-
-    this->node_ar = py::array_t<skl_tree_node>(node_ar_shape, node_ar_strides, node_ar_ptr);
-    this->value_ar = py::array_t<double>(value_ar_shape, value_ar_strides, value_ar_ptr);
+    this->node_ar = new skl_tree_node[this->node_count];
+    this->value_ar = new double[this->node_count * this->class_count](); // oneDAL only supports scalar responses for now
 }
 
 template <typename Task>
 bool to_sklearn_tree_object_visitor<Task>::call(const df::split_node_info<Task>& info) {
-    py::buffer_info node_ar_buf = this->node_ar.request();
-
-    skl_tree_node* node_ar_ptr = static_cast<skl_tree_node*>(node_ar_buf.ptr);
-
     if (info.get_level() > 0) {
         // has parents
         Py_ssize_t parent = parents[info.get_level() - 1];
-        if (node_ar_ptr[parent].left_child > 0) {
-            assert(node_ar_ptr[node_id].right_child < 0);
-            node_ar_ptr[parent].right_child = node_id;
+        if (this->node_ar[parent].left_child > 0) {
+            assert(this->node_ar[node_id].right_child < 0);
+            this->node_ar[parent].right_child = node_id;
         }
         else {
-            node_ar_ptr[parent].left_child = node_id;
+            this->node_ar[parent].left_child = node_id;
         }
     }
     parents[info.get_level()] = node_id;
-    node_ar_ptr[node_id].feature = info.get_feature_index();
-    node_ar_ptr[node_id].threshold = info.get_feature_value();
-    node_ar_ptr[node_id].impurity = info.get_impurity();
-    node_ar_ptr[node_id].n_node_samples = info.get_sample_count();
-    node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count();
-    node_ar_ptr[node_id].missing_go_to_left = false;
+    this->node_ar[node_id].feature = info.get_feature_index();
+    this->node_ar[node_id].threshold = info.get_feature_value();
+    this->node_ar[node_id].impurity = info.get_impurity();
+    this->node_ar[node_id].n_node_samples = info.get_sample_count();
+    this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count();
+    this->node_ar[node_id].missing_go_to_left = false;
 
     // wrap-up
     ++node_id;
@@ -208,25 +218,21 @@ bool to_sklearn_tree_object_visitor<Task>::call(const df::split_node_info<Task>&
 // stuff that is done for all leaf node types
 template <typename Task>
 void to_sklearn_tree_object_visitor<Task>::_onLeafNode(const df::leaf_node_info<Task>& info) {
-    py::buffer_info node_ar_buf = this->node_ar.request();
-
-    skl_tree_node* node_ar_ptr = static_cast<skl_tree_node*>(node_ar_buf.ptr);
-
     if (info.get_level()) {
         Py_ssize_t parent = parents[info.get_level() - 1];
-        if (node_ar_ptr[parent].left_child > 0) {
-            assert(node_ar_ptr[node_id].right_child < 0);
-            node_ar_ptr[parent].right_child = node_id;
+        if (this->node_ar[parent].left_child > 0) { // parent's left slot is already set, so this node becomes its right child
+            assert(this->node_ar[node_id].right_child < 0); // checks the current node's own right_child, not the parent's
+            this->node_ar[parent].right_child = node_id;
         }
         else {
-            node_ar_ptr[parent].left_child = node_id;
+            this->node_ar[parent].left_child = node_id; // first child recorded for this parent goes into the left slot
         }
     }
 
-    node_ar_ptr[node_id].impurity = info.get_impurity();
-    node_ar_ptr[node_id].n_node_samples = info.get_sample_count();
-    node_ar_ptr[node_id].weighted_n_node_samples = info.get_sample_count();
-    node_ar_ptr[node_id].missing_go_to_left = false;
+    this->node_ar[node_id].impurity = info.get_impurity(); // writes go straight to the raw node buffer (no py::buffer_info request anymore)
+    this->node_ar[node_id].n_node_samples = info.get_sample_count();
+    this->node_ar[node_id].weighted_n_node_samples = info.get_sample_count(); // same value as n_node_samples
+    this->node_ar[node_id].missing_go_to_left = false; // always set to false here
 }
 
 template <>
@@ -235,10 +241,7 @@ bool to_sklearn_tree_object_visitor<df::task::regression>::call(
     _onLeafNode(info);
     OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, node_id, class_count);
 
-    py::buffer_info value_ar_buf = this->value_ar.request();
-    double* value_ar_ptr = static_cast<double*>(value_ar_buf.ptr);
-
-    value_ar_ptr[node_id * 1 * this->class_count] = info.get_response();
+    this->value_ar[node_id * this->class_count] = info.get_response();
 
     // wrap-up
     ++node_id;
@@ -248,8 +251,6 @@ bool to_sklearn_tree_object_visitor<df::task::regression>::call(
 template <>
 bool to_sklearn_tree_object_visitor<df::task::classification>::call(
     const df::leaf_node_info<df::task::classification>& info) {
-    py::buffer_info value_ar_buf = this->value_ar.request();
-    double* value_ar_ptr = static_cast<double*>(value_ar_buf.ptr);
 
     if (info.get_level() > 0) {
         std::size_t depth = static_cast<const std::size_t>(info.get_level()) - 1;
@@ -258,7 +259,7 @@ bool to_sklearn_tree_object_visitor<df::task::classification>::call(
             OVERFLOW_CHECK_BY_MULTIPLICATION(std::size_t, id, this->class_count);
             const auto row = id * 1 * this->class_count;
             OVERFLOW_CHECK_BY_ADDING(std::size_t, row, info.get_response());
-            value_ar_ptr[row + info.get_response()] += info.get_sample_count();
+            this->value_ar[row + info.get_response()] += info.get_sample_count();
             if (depth == 0) {
                 break;
             }
@@ -267,7 +268,7 @@ bool to_sklearn_tree_object_visitor<df::task::classification>::call(
     }
     _onLeafNode(info);
     OVERFLOW_CHECK_BY_ADDING(std::size_t, node_id * 1 * this->class_count, info.get_response());
-    value_ar_ptr[node_id * 1 * this->class_count + info.get_response()] += info.get_sample_count();
+    this->value_ar[node_id * this->class_count + info.get_response()] += info.get_sample_count();
 
     // wrap-up
     ++node_id;
@@ -278,7 +279,7 @@ template <typename Task>
 void init_get_tree_state(py::module_& m) {
     using namespace decision_forest;
     using model_t = model<Task>;
-    using tree_state_t = tree_state<Task>;
+    using tree_state_t = tree_state_py<Task>;
 
     // TODO:
     // create one instance for cls and reg.