Basic statistics algorithm for sparse data (oneapi-src#2548)
* Add basic statistics CSR prototype

* Add new files

* Dispatching added

* add files

* Add CSR random generation to test engine and minor algorithm updates

* Working version without full testing

* Add CSR table builder in test engine

* Minor refactoring

* Tests and algorithm are ready

* Fix host test failures: set CSR generation to one-based

* Add copy constructor for csr_table and minor refactoring

* Minor refactoring

* Apply clang-format

* Minor fixes

* Minor code suggestion from PR

* Add distributed mode

* Apply suggestions

* DAAL kernel debugging

* Add primitives to BUILD

* Add primitives to BUILD

* Add debug prints

* Fix public CI test failure: undefined behaviour with nanf

* Remove redundant includes

* Specify USM memory type for accessor

* Minor updates: add doc comment, update table init for csr_builder

* Change dispatching for basic statistics from compile time to runtime

* Apply clang-format and add asserts

* Update main kernel

* Code cleaning

* Apply clang-format

* Apply PR's suggestions

* Kernel optimizations

* Update test table generation (remove primitives from test namespace)

* Remove temporary copy constructor from csr table

* Proper rebase on master

* Add more test cases

* Change kernel dispatching for CPU
inteldimitrius committed Nov 22, 2023
1 parent 4c9a24c commit d1ed4fd
Showing 15 changed files with 873 additions and 33 deletions.
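
For context, a minimal usage sketch of how the new sparse path could be exercised through the public compute API. It assumes the `method::sparse` tag and the result-option names that appear in the diff below; the CSR table construction is omitted, and the snippet is illustrative rather than taken from this commit.

#include "oneapi/dal/algo/basic_statistics.hpp"

namespace dal = oneapi::dal;

// `data` is assumed to already be a dal::csr_table built elsewhere
// (values, column indices, and row offsets of a CSR matrix).
inline dal::basic_statistics::compute_result<> compute_sparse_stats(const dal::table& data) {
    // Select the sparse method and request a subset of the available statistics.
    const auto desc =
        dal::basic_statistics::descriptor<float, dal::basic_statistics::method::sparse>{}
            .set_result_options(dal::basic_statistics::result_options::mean |
                                dal::basic_statistics::result_options::variance);
    return dal::compute(desc, data);
}
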
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021 Intel Corporation
* Copyright 2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,7 +32,6 @@
namespace oneapi::dal::basic_statistics::backend {

using dal::backend::context_cpu;
using method_t = method::dense;
using task_t = task::compute;
using input_t = compute_input<task_t>;
using result_t = compute_result<task_t>;
@@ -41,19 +40,26 @@ using descriptor_t = detail::descriptor_base<task_t>;
namespace daal_lom = daal::algorithms::low_order_moments;
namespace interop = dal::backend::interop;

template <typename Float, daal::CpuType Cpu>
using daal_lom_batch_kernel_t =
daal_lom::internal::LowOrderMomentsBatchKernel<Float, daal_lom::defaultDense, Cpu>;
template <daal_lom::Method Value>
using daal_method_constant = std::integral_constant<daal_lom::Method, Value>;

template <typename Method>
struct to_daal_method;

template <>
struct to_daal_method<method::dense> : daal_method_constant<daal_lom::defaultDense> {};

template <>
struct to_daal_method<method::sparse> : daal_method_constant<daal_lom::fastCSR> {};

template <typename Float, daal::CpuType Cpu, typename Method>
using batch_kernel_t =
daal_lom::internal::LowOrderMomentsBatchKernel<Float, to_daal_method<Method>::value, Cpu>;

template <typename Float, daal::CpuType Cpu>
using daal_lom_online_kernel_t =
daal_lom::internal::LowOrderMomentsOnlineKernel<Float, daal_lom::defaultDense, Cpu>;

template <typename Method>
constexpr daal_lom::Method get_daal_method() {
return daal_lom::defaultDense;
}

template <typename Float>
std::int64_t propose_block_size(std::int64_t row_count, std::int64_t col_count) {
using idx_t = std::int64_t;
@@ -174,10 +180,12 @@ result_t call_daal_kernel_with_weights(const context_cpu& ctx,
return result;
}

template <typename Float>
template <typename Float, typename Method>
result_t call_daal_kernel_without_weights(const context_cpu& ctx,
const descriptor_t& desc,
const table& data) {
auto daal_method =
std::is_same_v<method::dense, Method> ? daal_lom::defaultDense : daal_lom::fastCSR;
const auto daal_data = interop::convert_to_daal_table<Float>(data);

auto daal_parameter = daal_lom::Parameter(get_daal_estimates_to_compute(desc));
@@ -187,21 +195,22 @@ result_t call_daal_kernel_without_weights(const context_cpu& ctx,
daal_input.set(daal_lom::InputId::data, daal_data);

interop::status_to_exception(
daal_result.allocate<Float>(&daal_input, &daal_parameter, get_daal_method<method_t>()));
daal_result.allocate<Float>(&daal_input, &daal_parameter, daal_method));

interop::status_to_exception(
interop::call_daal_kernel<Float, daal_lom_batch_kernel_t>(ctx,
daal_data.get(),
&daal_result,
&daal_parameter));
interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) {
return batch_kernel_t<Float,
oneapi::dal::backend::interop::to_daal_cpu_type<decltype(cpu)>::value,
Method>()
.compute(daal_data.get(), &daal_result, &daal_parameter);
}));

auto result =
get_result<Float, task_t>(desc, daal_result).set_result_options(desc.get_result_options());

return result;
}

template <typename Float>
template <typename Float, typename Method>
static result_t compute(const context_cpu& ctx, const descriptor_t& desc, const input_t& input) {
if (input.get_weights().has_data()) {
return call_daal_kernel_with_weights<Float>(ctx,
@@ -210,20 +219,22 @@ static result_t compute(const context_cpu& ctx, const descriptor_t& desc, const
input.get_weights());
}
else {
return call_daal_kernel_without_weights<Float>(ctx, desc, input.get_data());
return call_daal_kernel_without_weights<Float, Method>(ctx, desc, input.get_data());
}
}

template <typename Float>
struct compute_kernel_cpu<Float, method_t, task_t> {
template <typename Float, typename Method>
struct compute_kernel_cpu<Float, Method, task_t> {
result_t operator()(const context_cpu& ctx,
const descriptor_t& desc,
const input_t& input) const {
return compute<Float>(ctx, desc, input);
return compute<Float, Method>(ctx, desc, input);
}
};

template struct compute_kernel_cpu<float, method_t, task_t>;
template struct compute_kernel_cpu<double, method_t, task_t>;
template struct compute_kernel_cpu<float, method::dense, task_t>;
template struct compute_kernel_cpu<double, method::dense, task_t>;
template struct compute_kernel_cpu<float, method::sparse, task_t>;
template struct compute_kernel_cpu<double, method::sparse, task_t>;

} // namespace oneapi::dal::basic_statistics::backend
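
The central change in the CPU backend above is that the fixed `method_t = method::dense` alias and the compile-time `get_daal_method()` helper give way to the `to_daal_method` trait plus runtime CPU dispatch via `dispatch_by_cpu`. Below is a self-contained sketch of the trait pattern only, with simplified stand-ins for the DAAL method enum and the oneDAL method tags; it is not the library code itself.

// Standalone illustration of mapping a method tag to an enum value at compile time.
#include <iostream>
#include <type_traits>

enum class daal_method { default_dense, fast_csr };

namespace method {
struct dense {};
struct sparse {};
} // namespace method

template <daal_method Value>
using daal_method_constant = std::integral_constant<daal_method, Value>;

template <typename Method>
struct to_daal_method;

template <>
struct to_daal_method<method::dense> : daal_method_constant<daal_method::default_dense> {};

template <>
struct to_daal_method<method::sparse> : daal_method_constant<daal_method::fast_csr> {};

int main() {
    // The mapping is resolved at compile time, so it can parameterize a kernel
    // template the same way batch_kernel_t is parameterized in the diff above.
    static_assert(to_daal_method<method::sparse>::value == daal_method::fast_csr);
    std::cout << static_cast<int>(to_daal_method<method::dense>::value) << '\n'; // prints 0
    return 0;
}
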
@@ -17,15 +17,17 @@
#pragma once

#include "oneapi/dal/algo/basic_statistics/compute_types.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/backend/dispatcher.hpp"

namespace oneapi::dal::basic_statistics::backend {

template <typename Float, typename Method, typename Task>
struct compute_kernel_cpu {
using input_t = compute_input<Task>;
compute_result<Task> operator()(const dal::backend::context_cpu& ctx,
const detail::descriptor_base<Task>& params,
const compute_input<Task>& input) const;
const input_t& input) const;
};

} // namespace oneapi::dal::basic_statistics::backend
@@ -17,15 +17,17 @@
#pragma once

#include "oneapi/dal/algo/basic_statistics/compute_types.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/backend/dispatcher.hpp"

namespace oneapi::dal::basic_statistics::backend {

template <typename Float, typename Method, typename Task>
struct compute_kernel_gpu {
using input_t = compute_input<Task>;
compute_result<Task> operator()(const dal::backend::context_gpu& ctx,
const detail::descriptor_base<Task>& params,
const compute_input<Task>& input) const;
const input_t& input) const;
};

} // namespace oneapi::dal::basic_statistics::backend
@@ -0,0 +1,163 @@
/*******************************************************************************
* Copyright 2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#pragma once

#include "oneapi/dal/algo/basic_statistics/backend/gpu/compute_kernel.hpp"
#include "oneapi/dal/backend/primitives/utils.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/util/common.hpp"
#include "oneapi/dal/detail/policy.hpp"
#include "oneapi/dal/backend/communicator.hpp"

#ifdef ONEDAL_DATA_PARALLEL

namespace oneapi::dal::basic_statistics::backend {

namespace de = dal::detail;
namespace bk = dal::backend;
namespace pr = dal::backend::primitives;

enum stat { min, max, sum, sum2, sum2_cent, mean, moment2, variance, stddev, variation };

template <typename Float>
class compute_kernel_csr_impl {
using method_t = method::sparse;
using task_t = task::compute;
using comm_t = bk::communicator<spmd::device_memory_access::usm>;
using input_t = compute_input<task_t>;
using result_t = compute_result<task_t>;
using descriptor_t = detail::descriptor_base<task_t>;

public:
result_t operator()(const bk::context_gpu& ctx, const descriptor_t& desc, const input_t& input);

private:
// Number of different basic statistics
static constexpr std::int32_t res_opt_count_ = 10;
// An array of basic statistics
const result_option_id res_options_[res_opt_count_] = { result_options::min,
result_options::max,
result_options::sum,
result_options::sum_squares,
result_options::sum_squares_centered,
result_options::mean,
result_options::second_order_raw_moment,
result_options::variance,
result_options::standard_deviation,
result_options::variation };

result_t get_result(sycl::queue q,
const pr::ndarray<Float, 2> computed_result,
result_option_id requested_results,
const std::vector<sycl::event>& deps = {}) {
result_t res;
std::vector<sycl::event> res_events;
res.set_result_options(requested_results);
if (requested_results.test(result_options::min)) {
auto index = get_result_option_index(result_options::min);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_min(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::max)) {
auto index = get_result_option_index(result_options::max);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_max(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum)) {
auto index = get_result_option_index(result_options::sum);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum_squares)) {
auto index = get_result_option_index(result_options::sum_squares);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum_squares(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum_squares_centered)) {
auto index = get_result_option_index(result_options::sum_squares_centered);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum_squares_centered(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::mean)) {
auto index = get_result_option_index(result_options::mean);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_mean(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::second_order_raw_moment)) {
auto index = get_result_option_index(result_options::second_order_raw_moment);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_second_order_raw_moment(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::variance)) {
auto index = get_result_option_index(result_options::variance);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_variance(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::standard_deviation)) {
auto index = get_result_option_index(result_options::standard_deviation);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_standard_deviation(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::variation)) {
auto index = get_result_option_index(result_options::variation);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_variation(res_table);
res_events.push_back(event);
}
sycl::event::wait_and_throw(res_events);
return res;
}

std::tuple<table, sycl::event> get_result_table(sycl::queue q,
const pr::ndarray<Float, 2> computed_result,
std::int32_t index,
const std::vector<sycl::event>& deps = {}) {
ONEDAL_ASSERT(computed_result.has_data());
auto column_count = computed_result.get_dimension(1);
const auto arr = dal::array<Float>::empty(column_count);
const auto res_arr_ptr = arr.get_mutable_data();
const auto computed_res_ptr = computed_result.get_data() + index * column_count;
auto event =
dal::backend::copy_usm2host(q, res_arr_ptr, computed_res_ptr, column_count, deps);
return std::make_tuple(homogen_table::wrap(arr, 1, column_count), event);
}

std::int32_t get_result_option_index(result_option_id opt) {
std::int32_t index = 0;
while (!opt.test(res_options_[index]))
++index;
return index;
}

sycl::event finalize_for_distr(sycl::queue& q,
comm_t& communicator,
pr::ndarray<Float, 2>& results,
const input_t& input,
const std::vector<sycl::event>& deps);
};

} // namespace oneapi::dal::basic_statistics::backend
#endif // ONEDAL_DATA_PARALLEL
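
For reference, the GPU implementation packs all `res_opt_count_` statistics into one row-major `ndarray<Float, 2>`, and `get_result_table` slices out a single row at offset `index * column_count` before wrapping it as a 1 x column_count homogen table. A host-only sketch of that offset arithmetic, with plain vectors standing in for the USM-backed array:

// Illustrative only: mimics the row layout used by get_result_table above.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::int64_t stat_count = 3;   // e.g. min, max, mean (simplified)
    const std::int64_t column_count = 4; // number of feature columns

    // Packed, row-major results as the kernel would lay them out on device.
    std::vector<float> packed(stat_count * column_count);
    for (std::int64_t i = 0; i < stat_count * column_count; ++i)
        packed[i] = static_cast<float>(i);

    // Extract the row that holds the statistic at `index`.
    const std::int64_t index = 2;
    std::vector<float> row(packed.begin() + index * column_count,
                           packed.begin() + (index + 1) * column_count);

    for (float v : row)
        std::cout << v << ' '; // prints 8 9 10 11
    std::cout << '\n';
    return 0;
}
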