Basic statistics algorithm for sparse data (oneapi-src#2548)
* Add basic statistics CSR prototype

* Add new files

* Dispatching added

* add files

* Add CSR random generation to test engine and minor algorithm updates

* Working version without full testing

* Add CSR table builder in test engine

* Minor refactoring

* Tests and algorithm are ready

* Fix host test failures: set CSR generation to one-based

* Add copy constructor for csr_table and minor refactoring

* Minor refactoring

* Apply clang-format

* Minor fixes

* Minor code suggestion from PR

* Add distributed mode

* Apply suggestions

* DAAL kernel debugging

* Add primitives to BUILD

* Add primitives to BUILD

* Add debug prints

* Fix public CI test failure: undefined behaviour with nanf

* Remove redundant includes

* Specify USM memory type for accessor

* Minor updates: add doc comment, update table init for csr_builder

* Change dispatching for basic statistics from compile time to runtime

* Apply clang-format and add asserts

* Update main kernel

* Code cleaning

* Apply clang-format

* Apply PR's suggestions

* Kernel optimizations

* Update test table generation (remove primitives from test namespace)

* Remove temporary copy constructor from csr table

* Proper rebase on master

* Add more test cases

* Change kernel dispatching for CPU
inteldimitrius committed Nov 22, 2023
1 parent 4c9a24c commit d1ed4fd
Showing 15 changed files with 873 additions and 33 deletions.
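
For context, a minimal usage sketch of how the new sparse path could be exercised through the public compute API. It assumes the `method::sparse` tag and the result-option names that appear in the diff below; the CSR table construction is omitted, and the snippet is illustrative rather than taken from this commit.

#include "oneapi/dal/algo/basic_statistics.hpp"

namespace dal = oneapi::dal;

// `data` is assumed to already be a dal::csr_table built elsewhere
// (values, column indices, and row offsets of a CSR matrix).
inline dal::basic_statistics::compute_result<> compute_sparse_stats(const dal::table& data) {
    // Select the sparse method and request a subset of the available statistics.
    const auto desc =
        dal::basic_statistics::descriptor<float, dal::basic_statistics::method::sparse>{}
            .set_result_options(dal::basic_statistics::result_options::mean |
                                dal::basic_statistics::result_options::variance);
    return dal::compute(desc, data);
}
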
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021 Intel Corporation
* Copyright 2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,7 +32,6 @@
namespace oneapi::dal::basic_statistics::backend {

using dal::backend::context_cpu;
using method_t = method::dense;
using task_t = task::compute;
using input_t = compute_input<task_t>;
using result_t = compute_result<task_t>;
@@ -41,19 +40,26 @@ using descriptor_t = detail::descriptor_base<task_t>;
namespace daal_lom = daal::algorithms::low_order_moments;
namespace interop = dal::backend::interop;

template <typename Float, daal::CpuType Cpu>
using daal_lom_batch_kernel_t =
daal_lom::internal::LowOrderMomentsBatchKernel<Float, daal_lom::defaultDense, Cpu>;
template <daal_lom::Method Value>
using daal_method_constant = std::integral_constant<daal_lom::Method, Value>;

template <typename Method>
struct to_daal_method;

template <>
struct to_daal_method<method::dense> : daal_method_constant<daal_lom::defaultDense> {};

template <>
struct to_daal_method<method::sparse> : daal_method_constant<daal_lom::fastCSR> {};

template <typename Float, daal::CpuType Cpu, typename Method>
using batch_kernel_t =
daal_lom::internal::LowOrderMomentsBatchKernel<Float, to_daal_method<Method>::value, Cpu>;

template <typename Float, daal::CpuType Cpu>
using daal_lom_online_kernel_t =
daal_lom::internal::LowOrderMomentsOnlineKernel<Float, daal_lom::defaultDense, Cpu>;

template <typename Method>
constexpr daal_lom::Method get_daal_method() {
return daal_lom::defaultDense;
}

template <typename Float>
std::int64_t propose_block_size(std::int64_t row_count, std::int64_t col_count) {
using idx_t = std::int64_t;
@@ -174,10 +180,12 @@ result_t call_daal_kernel_with_weights(const context_cpu& ctx,
return result;
}

template <typename Float>
template <typename Float, typename Method>
result_t call_daal_kernel_without_weights(const context_cpu& ctx,
const descriptor_t& desc,
const table& data) {
auto daal_method =
std::is_same_v<method::dense, Method> ? daal_lom::defaultDense : daal_lom::fastCSR;
const auto daal_data = interop::convert_to_daal_table<Float>(data);

auto daal_parameter = daal_lom::Parameter(get_daal_estimates_to_compute(desc));
@@ -187,21 +195,22 @@ result_t call_daal_kernel_without_weights(const context_cpu& ctx,
daal_input.set(daal_lom::InputId::data, daal_data);

interop::status_to_exception(
daal_result.allocate<Float>(&daal_input, &daal_parameter, get_daal_method<method_t>()));
daal_result.allocate<Float>(&daal_input, &daal_parameter, daal_method));

interop::status_to_exception(
interop::call_daal_kernel<Float, daal_lom_batch_kernel_t>(ctx,
daal_data.get(),
&daal_result,
&daal_parameter));
interop::status_to_exception(dal::backend::dispatch_by_cpu(ctx, [&](auto cpu) {
return batch_kernel_t<Float,
oneapi::dal::backend::interop::to_daal_cpu_type<decltype(cpu)>::value,
Method>()
.compute(daal_data.get(), &daal_result, &daal_parameter);
}));

auto result =
get_result<Float, task_t>(desc, daal_result).set_result_options(desc.get_result_options());

return result;
}

template <typename Float>
template <typename Float, typename Method>
static result_t compute(const context_cpu& ctx, const descriptor_t& desc, const input_t& input) {
if (input.get_weights().has_data()) {
return call_daal_kernel_with_weights<Float>(ctx,
@@ -210,20 +219,22 @@ static result_t compute(const context_cpu& ctx, const descriptor_t& desc, const
input.get_weights());
}
else {
return call_daal_kernel_without_weights<Float>(ctx, desc, input.get_data());
return call_daal_kernel_without_weights<Float, Method>(ctx, desc, input.get_data());
}
}

template <typename Float>
struct compute_kernel_cpu<Float, method_t, task_t> {
template <typename Float, typename Method>
struct compute_kernel_cpu<Float, Method, task_t> {
result_t operator()(const context_cpu& ctx,
const descriptor_t& desc,
const input_t& input) const {
return compute<Float>(ctx, desc, input);
return compute<Float, Method>(ctx, desc, input);
}
};

template struct compute_kernel_cpu<float, method_t, task_t>;
template struct compute_kernel_cpu<double, method_t, task_t>;
template struct compute_kernel_cpu<float, method::dense, task_t>;
template struct compute_kernel_cpu<double, method::dense, task_t>;
template struct compute_kernel_cpu<float, method::sparse, task_t>;
template struct compute_kernel_cpu<double, method::sparse, task_t>;

} // namespace oneapi::dal::basic_statistics::backend
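
The central change in the CPU backend above is that the fixed `method_t = method::dense` alias and the compile-time `get_daal_method()` helper give way to the `to_daal_method` trait plus runtime CPU dispatch via `dispatch_by_cpu`. Below is a self-contained sketch of the trait pattern only, with simplified stand-ins for the DAAL method enum and the oneDAL method tags; it is not the library code itself.

// Standalone illustration of mapping a method tag to an enum value at compile time.
#include <iostream>
#include <type_traits>

enum class daal_method { default_dense, fast_csr };

namespace method {
struct dense {};
struct sparse {};
} // namespace method

template <daal_method Value>
using daal_method_constant = std::integral_constant<daal_method, Value>;

template <typename Method>
struct to_daal_method;

template <>
struct to_daal_method<method::dense> : daal_method_constant<daal_method::default_dense> {};

template <>
struct to_daal_method<method::sparse> : daal_method_constant<daal_method::fast_csr> {};

int main() {
    // The mapping is resolved at compile time, so it can parameterize a kernel
    // template the same way batch_kernel_t is parameterized in the diff above.
    static_assert(to_daal_method<method::sparse>::value == daal_method::fast_csr);
    std::cout << static_cast<int>(to_daal_method<method::dense>::value) << '\n'; // prints 0
    return 0;
}
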
@@ -17,15 +17,17 @@
#pragma once

#include "oneapi/dal/algo/basic_statistics/compute_types.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/backend/dispatcher.hpp"

namespace oneapi::dal::basic_statistics::backend {

template <typename Float, typename Method, typename Task>
struct compute_kernel_cpu {
using input_t = compute_input<Task>;
compute_result<Task> operator()(const dal::backend::context_cpu& ctx,
const detail::descriptor_base<Task>& params,
const compute_input<Task>& input) const;
const input_t& input) const;
};

} // namespace oneapi::dal::basic_statistics::backend
@@ -17,15 +17,17 @@
#pragma once

#include "oneapi/dal/algo/basic_statistics/compute_types.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/backend/dispatcher.hpp"

namespace oneapi::dal::basic_statistics::backend {

template <typename Float, typename Method, typename Task>
struct compute_kernel_gpu {
using input_t = compute_input<Task>;
compute_result<Task> operator()(const dal::backend::context_gpu& ctx,
const detail::descriptor_base<Task>& params,
const compute_input<Task>& input) const;
const input_t& input) const;
};

} // namespace oneapi::dal::basic_statistics::backend
@@ -0,0 +1,163 @@
/*******************************************************************************
* Copyright 2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#pragma once

#include "oneapi/dal/algo/basic_statistics/backend/gpu/compute_kernel.hpp"
#include "oneapi/dal/backend/primitives/utils.hpp"
#include "oneapi/dal/table/csr.hpp"
#include "oneapi/dal/util/common.hpp"
#include "oneapi/dal/detail/policy.hpp"
#include "oneapi/dal/backend/communicator.hpp"

#ifdef ONEDAL_DATA_PARALLEL

namespace oneapi::dal::basic_statistics::backend {

namespace de = dal::detail;
namespace bk = dal::backend;
namespace pr = dal::backend::primitives;

enum stat { min, max, sum, sum2, sum2_cent, mean, moment2, variance, stddev, variation };

template <typename Float>
class compute_kernel_csr_impl {
using method_t = method::sparse;
using task_t = task::compute;
using comm_t = bk::communicator<spmd::device_memory_access::usm>;
using input_t = compute_input<task_t>;
using result_t = compute_result<task_t>;
using descriptor_t = detail::descriptor_base<task_t>;

public:
result_t operator()(const bk::context_gpu& ctx, const descriptor_t& desc, const input_t& input);

private:
// Number of different basic statistics
static constexpr std::int32_t res_opt_count_ = 10;
// An array of basic statistics
const result_option_id res_options_[res_opt_count_] = { result_options::min,
result_options::max,
result_options::sum,
result_options::sum_squares,
result_options::sum_squares_centered,
result_options::mean,
result_options::second_order_raw_moment,
result_options::variance,
result_options::standard_deviation,
result_options::variation };

result_t get_result(sycl::queue q,
const pr::ndarray<Float, 2> computed_result,
result_option_id requested_results,
const std::vector<sycl::event>& deps = {}) {
result_t res;
std::vector<sycl::event> res_events;
res.set_result_options(requested_results);
if (requested_results.test(result_options::min)) {
auto index = get_result_option_index(result_options::min);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_min(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::max)) {
auto index = get_result_option_index(result_options::max);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_max(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum)) {
auto index = get_result_option_index(result_options::sum);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum_squares)) {
auto index = get_result_option_index(result_options::sum_squares);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum_squares(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::sum_squares_centered)) {
auto index = get_result_option_index(result_options::sum_squares_centered);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_sum_squares_centered(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::mean)) {
auto index = get_result_option_index(result_options::mean);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_mean(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::second_order_raw_moment)) {
auto index = get_result_option_index(result_options::second_order_raw_moment);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_second_order_raw_moment(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::variance)) {
auto index = get_result_option_index(result_options::variance);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_variance(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::standard_deviation)) {
auto index = get_result_option_index(result_options::standard_deviation);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_standard_deviation(res_table);
res_events.push_back(event);
}
if (requested_results.test(result_options::variation)) {
auto index = get_result_option_index(result_options::variation);
auto [res_table, event] = get_result_table(q, computed_result, index, deps);
res.set_variation(res_table);
res_events.push_back(event);
}
sycl::event::wait_and_throw(res_events);
return res;
}

std::tuple<table, sycl::event> get_result_table(sycl::queue q,
const pr::ndarray<Float, 2> computed_result,
std::int32_t index,
const std::vector<sycl::event>& deps = {}) {
ONEDAL_ASSERT(computed_result.has_data());
auto column_count = computed_result.get_dimension(1);
const auto arr = dal::array<Float>::empty(column_count);
const auto res_arr_ptr = arr.get_mutable_data();
const auto computed_res_ptr = computed_result.get_data() + index * column_count;
auto event =
dal::backend::copy_usm2host(q, res_arr_ptr, computed_res_ptr, column_count, deps);
return std::make_tuple(homogen_table::wrap(arr, 1, column_count), event);
}

std::int32_t get_result_option_index(result_option_id opt) {
std::int32_t index = 0;
while (!opt.test(res_options_[index]))
++index;
return index;
}

sycl::event finalize_for_distr(sycl::queue& q,
comm_t& communicator,
pr::ndarray<Float, 2>& results,
const input_t& input,
const std::vector<sycl::event>& deps);
};

} // namespace oneapi::dal::basic_statistics::backend
#endif // ONEDAL_DATA_PARALLEL
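
For reference, the GPU implementation packs all `res_opt_count_` statistics into one row-major `ndarray<Float, 2>`, and `get_result_table` slices out a single row at offset `index * column_count` before wrapping it as a 1 x column_count homogen table. A host-only sketch of that offset arithmetic, with plain vectors standing in for the USM-backed array:

// Illustrative only: mimics the row layout used by get_result_table above.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::int64_t stat_count = 3;   // e.g. min, max, mean (simplified)
    const std::int64_t column_count = 4; // number of feature columns

    // Packed, row-major results as the kernel would lay them out on device.
    std::vector<float> packed(stat_count * column_count);
    for (std::int64_t i = 0; i < stat_count * column_count; ++i)
        packed[i] = static_cast<float>(i);

    // Extract the row that holds the statistic at `index`.
    const std::int64_t index = 2;
    std::vector<float> row(packed.begin() + index * column_count,
                           packed.begin() + (index + 1) * column_count);

    for (float v : row)
        std::cout << v << ' '; // prints 8 9 10 11
    std::cout << '\n';
    return 0;
}
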