Skip to content

Commit

Permalink
Merge branch 'main' into dev/asolovev_onemkl_enabling_clear
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexandr-Solovev authored Aug 30, 2024
2 parents 40e0641 + ffed86d commit 33c8167
Show file tree
Hide file tree
Showing 44 changed files with 634 additions and 588 deletions.
2 changes: 0 additions & 2 deletions .ci/env/apt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ function install_mkl {

# Installs the clang-format-14 package with apt, then registers and selects
# /usr/bin/clang-format-14 as the system-wide `clang-format` alternative.
function install_clang-format {
    local -r versioned_bin=/usr/bin/clang-format-14

    sudo apt-get install -y clang-format-14
    # Register the versioned binary under the generic name with priority 100 ...
    sudo update-alternatives --install /usr/bin/clang-format clang-format "${versioned_bin}" 100
    # ... and pin the generic name to it explicitly.
    sudo update-alternatives --set clang-format "${versioned_bin}"
}

function install_dev-base {
Expand Down
4 changes: 3 additions & 1 deletion .ci/scripts/clang-format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ echo "Starting format check..."

RETURN_CODE=0

CLANG_FORMAT_EXE=${CLANG_FORMAT_EXE:-clang-format-14}

for sources_path in cpp/daal cpp/oneapi examples/oneapi examples/daal samples/oneapi samples/daal; do
pushd ${sources_path} || exit 1
for filename in $(find . -type f | grep -P ".*\.(c|cpp|h|hpp|cl|i)$"); do clang-format -style=file -i "${filename}"; done
for filename in $(find . -type f | grep -P ".*\.(c|cpp|h|hpp|cl|i)$"); do ${CLANG_FORMAT_EXE} -style=file -i "${filename}"; done

git status | grep "nothing to commit" > /dev/null

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/renovate-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ jobs:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4
- name: Validate
uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.0.1
uses: suzuki-shunsuke/github-action-renovate-config-validator@v1.1.0
with:
config_file_path: .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,9 @@ Status PredictClassificationTask<algorithmFPType, cpu>::predictAllPointsByAllTre
ReadRows<algorithmFPType, cpu> xBD(const_cast<NumericTable *>(_data), 0, nRowsOfRes);
DAAL_CHECK_BLOCK_STATUS(xBD);
const algorithmFPType * const aX = xBD.get();
if (numberOfTrees > _minTreesForThreading)
// TODO: investigate why higher-level parallelism for trees causes performance degradation
// (excessive memory and CPU resource usage), especially on systems with a high number of cores
if (false)
{
daal::static_tls<algorithmFPType *> tlsData([=]() { return service_scalable_calloc<algorithmFPType, cpu>(_nClasses * nRowsOfRes); });

Expand Down
16 changes: 14 additions & 2 deletions cpp/daal/src/services/compiler/generic/env_detect_features.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
#if defined(TARGET_X86_64)
#include <immintrin.h>
#elif defined(TARGET_ARM)
#include <arm_sve.h>
#include <sys/auxv.h>
#include <asm/hwcap.h>
#elif defined(TARGET_RISCV64)
// TODO: Include vector if and when we need to use some vector intrinsics in
// here
Expand Down Expand Up @@ -218,14 +219,25 @@ DAAL_EXPORT int __daal_serv_cpu_detect(int enable)
return daal::sse2;
}
#elif defined(TARGET_ARM)
/// Reports whether the SVE bit is set in the hardware-capability word that
/// getauxval(AT_HWCAP) returns for the current process.
///
/// @return true when (AT_HWCAP & HWCAP_SVE) is non-zero, false otherwise.
static bool check_sve_features()
{
    const unsigned long capabilities = getauxval(AT_HWCAP);
    const bool sve_bit_set           = (capabilities & HWCAP_SVE) != 0;
    return sve_bit_set;
}

/// ARM (TARGET_ARM) variant of the CPU-extensions query.
///
/// @return Always false in this branch; no runtime probing is performed here.
DAAL_EXPORT bool __daal_serv_cpu_extensions_available()
{
    return false;
}

/// ARM (TARGET_ARM) variant of CPU feature detection.
///
/// @param enable  Not referenced by this implementation.
/// @return daal::sve when check_sve_features() reports that SVE is available;
///         -1 otherwise.
///         NOTE(review): callers presumably treat -1 as "no supported
///         instruction set" - confirm against the dispatcher.
DAAL_EXPORT int __daal_serv_cpu_detect(int enable)
{
    // Do not report SVE unconditionally: an early `return daal::sve;` ahead of
    // this check would make it unreachable and claim SVE on hardware without it.
    if (check_sve_features())
    {
        return daal::sve;
    }
    return -1;
}

void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t * abcd)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,13 @@ result_t compute_kernel_csr_impl<Float>::operator()(const bk::context_gpu& ctx,
if (row_count != cur_row_count) {
auto cur_min = result_data_ptr[stat::min * column_count + col_idx];
auto cur_max = result_data_ptr[stat::max * column_count + col_idx];
#if __SYCL_COMPILER_VERSION >= 20240715
result_data_ptr[stat::min * column_count + col_idx] = Float(sycl::fmin(cur_min, 0));
result_data_ptr[stat::max * column_count + col_idx] = Float(sycl::fmax(cur_max, 0));
#else
result_data_ptr[stat::min * column_count + col_idx] = sycl::min<Float>(cur_min, 0);
result_data_ptr[stat::max * column_count + col_idx] = sycl::max<Float>(cur_max, 0);
#endif
cur_sum2_cent += Float(row_count - cur_row_count) * mean_val * mean_val;
}
result_data_ptr[stat::sum2_cent * column_count + col_idx] = cur_sum2_cent;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,11 @@ inline void merge_blocks_kernel(sycl::nd_item<1> item,

if constexpr (!DefferedFin) {
Float mrgvariance = mrgsum2cent / (mrgvectors - Float(1));
#if __SYCL_COMPILER_VERSION >= 20240715
Float mrgstdev = (Float)sycl::sqrt(mrgvariance);
#else
Float mrgstdev = (Float)sqrt(mrgvariance);
#endif

if constexpr (check_mask_flag(bs_list::sorm, List)) {
rsorm_ptr[group_id] = mrgsum2 / mrgvectors;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

#include "oneapi/dal/algo/basic_statistics/backend/basic_statistics_interop.hpp"

#ifdef ONEDAL_DATA_PARALLEL

namespace oneapi::dal::basic_statistics::backend {

namespace bk = dal::backend;
Expand Down Expand Up @@ -151,16 +153,21 @@ result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_
const auto nobs_nd = pr::table2ndarray_1d<Float>(q, input.get_partial_n_rows());

auto rows_count_global = nobs_nd.get_data()[0];
auto is_distributed = (comm_.get_rank_count() > 1);
{
ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
if (is_distributed) {
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
}
}
if (res_op.test(result_options::min)) {
ONEDAL_ASSERT(input.get_partial_min().get_column_count() == column_count);
const auto min =
pr::table2ndarray_1d<Float>(q, input.get_partial_min(), sycl::usm::alloc::device);

{ comm_.allreduce(min.flatten(q, {}), spmd::reduce_op::min).wait(); }
if (is_distributed) {
comm_.allreduce(min.flatten(q, {}), spmd::reduce_op::min).wait();
}
res.set_min(homogen_table::wrap(min.flatten(q, {}), 1, column_count));
}

Expand All @@ -174,27 +181,48 @@ result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_
}

if (res_op_partial.test(result_options::sum)) {
const auto sums_nd =
auto sums_nd =
pr::table2ndarray_1d<Float>(q, input.get_partial_sum(), sycl::usm::alloc::device);
{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
}
const auto sums2_nd = pr::table2ndarray_1d<Float>(q,
input.get_partial_sum_squares(),
sycl::usm::alloc::device);
{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums2_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
}
const auto sums2cent_nd =
pr::table2ndarray_1d<Float>(q,
input.get_partial_sum_squares_centered(),
sycl::usm::alloc::device);
{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums2cent_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
auto sums2_nd = pr::table2ndarray_1d<Float>(q,
input.get_partial_sum_squares(),
sycl::usm::alloc::device);

auto sums2cent_nd = pr::table2ndarray_1d<Float>(q,
input.get_partial_sum_squares_centered(),
sycl::usm::alloc::device);
if (is_distributed) {
auto sums_nd_copy =
pr::ndarray<Float, 1>::empty(q, { column_count }, sycl::usm::alloc::device);
auto copy_event = copy(q, sums_nd_copy, sums_nd, {});
copy_event.wait_and_throw();
sums_nd = sums_nd_copy;

{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
}

auto sums2_nd_copy =
pr::ndarray<Float, 1>::empty(q, { column_count }, sycl::usm::alloc::device);
copy_event = copy(q, sums2_nd_copy, sums2_nd, {});
copy_event.wait_and_throw();
sums2_nd = sums2_nd_copy;

{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums2_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
}
auto sums2cent_nd_copy =
pr::ndarray<Float, 1>::empty(q, { column_count }, sycl::usm::alloc::device);
copy_event = copy(q, sums2cent_nd_copy, sums2cent_nd, {});
copy_event.wait_and_throw();
sums2cent_nd = sums2cent_nd_copy;
{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums2cent_nd.flatten(q, {}), spmd::reduce_op::sum).wait();
}
}

auto [result_means,
result_variance,
result_raw_moment,
Expand All @@ -210,18 +238,20 @@ result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_

if (res_op.test(result_options::sum)) {
ONEDAL_ASSERT(input.get_partial_sum().get_column_count() == column_count);
res.set_sum(input.get_partial_sum());
res.set_sum(homogen_table::wrap(sums_nd.flatten(q, { update_event }), 1, column_count));
}

if (res_op.test(result_options::sum_squares)) {
ONEDAL_ASSERT(input.get_partial_sum_squares().get_column_count() == column_count);
res.set_sum_squares(input.get_partial_sum_squares());
res.set_sum_squares(
homogen_table::wrap(sums2_nd.flatten(q, { update_event }), 1, column_count));
}

if (res_op.test(result_options::sum_squares_centered)) {
ONEDAL_ASSERT(input.get_partial_sum_squares_centered().get_column_count() ==
column_count);
res.set_sum_squares_centered(input.get_partial_sum_squares_centered());
res.set_sum_squares_centered(
homogen_table::wrap(sums2cent_nd.flatten(q, { update_event }), 1, column_count));
}

if (res_op.test(result_options::mean)) {
Expand Down Expand Up @@ -264,3 +294,5 @@ template class finalize_compute_kernel_dense_impl<float>;
template class finalize_compute_kernel_dense_impl<double>;

} // namespace oneapi::dal::basic_statistics::backend

#endif // ONEDAL_DATA_PARALLEL
10 changes: 6 additions & 4 deletions cpp/oneapi/dal/algo/basic_statistics/test/online_spmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ class basic_statistics_online_spmd_test
}
partial_results.push_back(partial_result);
}
const auto compute_result = this->finalize_compute_override(bs_desc, partial_results);

auto compute_result = this->finalize_compute_override(bs_desc, partial_results);
base_t::check_compute_result(compute_mode, data, weights, compute_result);
compute_result = this->finalize_compute_override(bs_desc, partial_results);
base_t::check_compute_result(compute_mode, data, weights, compute_result);
}
else {
Expand All @@ -103,8 +104,9 @@ class basic_statistics_online_spmd_test
}
partial_results.push_back(partial_result);
}
const auto compute_result = this->finalize_compute_override(bs_desc, partial_results);

auto compute_result = this->finalize_compute_override(bs_desc, partial_results);
base_t::check_compute_result(compute_mode, data, table{}, compute_result);
compute_result = this->finalize_compute_override(bs_desc, partial_results);
base_t::check_compute_result(compute_mode, data, table{}, compute_result);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,29 +66,39 @@ result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_

const auto nobs_host = pr::table2ndarray<Float>(q, input.get_partial_n_rows());
auto rows_count_global = nobs_host.get_data()[0];
{
ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
}

ONEDAL_ASSERT(rows_count_global > 0);

const auto sums =
pr::table2ndarray_1d<Float>(q, input.get_partial_sum(), sycl::usm::alloc::device);

{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait();
}

const auto xtx =
auto sums = pr::table2ndarray_1d<Float>(q, input.get_partial_sum(), sycl::usm::alloc::device);
auto xtx =
pr::table2ndarray<Float>(q, input.get_partial_crossproduct(), sycl::usm::alloc::device);

{
ONEDAL_PROFILER_TASK(allreduce_xtx, q);
comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait();
if (comm_.get_rank_count() > 1) {
{
ONEDAL_PROFILER_TASK(allreduce_rows_count_global);
comm_.allreduce(rows_count_global, spmd::reduce_op::sum).wait();
}
auto sums_copy =
pr::ndarray<Float, 1>::empty(q, { column_count }, sycl::usm::alloc::device);
auto copy_event = copy(q, sums_copy, sums, {});
copy_event.wait_and_throw();
sums = sums_copy;
{
ONEDAL_PROFILER_TASK(allreduce_sums, q);
comm_.allreduce(sums.flatten(q, {}), spmd::reduce_op::sum).wait();
}

auto xtx_copy = pr::ndarray<Float, 2>::empty(q,
{ column_count, column_count },
sycl::usm::alloc::device);
copy_event = copy(q, xtx_copy, xtx, {});
copy_event.wait_and_throw();
xtx = xtx_copy;
{
ONEDAL_PROFILER_TASK(allreduce_xtx, q);
comm_.allreduce(xtx.flatten(q, {}), spmd::reduce_op::sum).wait();
}
}

ONEDAL_ASSERT(rows_count_global > 0);

if (desc.get_result_options().test(result_options::cov_matrix)) {
auto [cov, cov_event] =
compute_covariance(q, rows_count_global, xtx, sums, bias, assume_centered);
Expand All @@ -101,8 +111,17 @@ result_t finalize_compute_kernel_dense_impl<Float>::operator()(const descriptor_
(homogen_table::wrap(corr.flatten(q, { corr_event }), column_count, column_count)));
}
if (desc.get_result_options().test(result_options::means)) {
auto [means, means_event] = compute_means(q, sums, rows_count_global);
result.set_means(homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count));
if (!assume_centered) {
auto [means, means_event] = compute_means(q, sums, rows_count_global);
result.set_means(
homogen_table::wrap(means.flatten(q, { means_event }), 1, column_count));
}
else {
auto [zero_means, zeros_event] =
pr::ndarray<Float, 1>::zeros(q, { column_count }, sycl::usm::alloc::device);
result.set_means(
homogen_table::wrap(zero_means.flatten(q, { zeros_event }), 1, column_count));
}
}
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ static partial_compute_result<Task> partial_compute(const context_gpu& ctx,
const std::int64_t column_count = data.get_column_count();
ONEDAL_ASSERT(column_count > 0);

auto assume_centered = desc.get_assume_centered();

dal::detail::check_mul_overflow(row_count, column_count);
dal::detail::check_mul_overflow(column_count, column_count);

const auto data_nd = pr::table2ndarray<Float>(q, data, sycl::usm::alloc::device);

auto [sums, sums_event] = compute_sums(q, data_nd);
auto [sums, sums_event] = compute_sums(q, data_nd, assume_centered, {});

auto [crossproduct, crossproduct_event] = compute_crossproduct(q, data_nd, { sums_event });

Expand Down
2 changes: 1 addition & 1 deletion cpp/oneapi/dal/algo/covariance/test/badargs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ using cov_types = COMBINE_TYPES((float, double),
(covariance::task::compute));

#define COVARIANCE_BADARG_TEST(name) \
TEMPLATE_TEST_M(covariance_badarg_test, name, "[covariance][badarg]", cov_types)
TEMPLATE_LIST_TEST_M(covariance_badarg_test, name, "[covariance][badarg]", cov_types)

COVARIANCE_BADARG_TEST("throws if input data is empty") {
const auto covariance_desc = this->get_descriptor();
Expand Down
Loading

0 comments on commit 33c8167

Please sign in to comment.