Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to using Kokkos::Experimental::sort_by_key #1191

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,12 @@ set_target_properties(ArborX PROPERTIES INTERFACE_COMPILE_FEATURES cxx_std_17)
add_dependencies(ArborX record_hash)

include(CMakeDependentOption)
cmake_dependent_option(ARBORX_ENABLE_ROCTHRUST "Enable rocThrust support" ON "Kokkos_ENABLE_HIP" OFF)
if(Kokkos_ENABLE_HIP AND ARBORX_ENABLE_ROCTHRUST)
find_package(rocthrust REQUIRED CONFIG)
target_link_libraries(ArborX INTERFACE roc::rocthrust)
endif()

if(Kokkos_ENABLE_HIP AND NOT ARBORX_ENABLE_ROCTHRUST)
message(WARNING "rocThrust is NOT enabled.\nThis will negatively impact performance on AMD GPUs.")
if(Kokkos_ENABLE_HIP)
if (NOT Kokkos_ENABLE_ROCTHRUST)
message(WARNING "rocThrust is NOT enabled.\nThis will negatively impact performance on AMD GPUs.")
endif()
set(ARBORX_ENABLE_ROCTHRUST ${Kokkos_ENABLE_ROCTHRUST})
endif()

cmake_dependent_option(ARBORX_ENABLE_ONEDPL "Enable oneDPL support" ON "Kokkos_ENABLE_SYCL" OFF)
Expand Down
2 changes: 1 addition & 1 deletion src/cluster/ArborX_Dendrogram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct Dendrogram
splitEdges(exec_space, edges, unweighted_edges, _parent_heights);

Kokkos::Profiling::pushRegion("ArborX::Dendrogram::sort_edges");
KokkosExt::sortByKey(exec_space, _parent_heights, unweighted_edges);
KokkosExt::sort_by_key(exec_space, _parent_heights, unweighted_edges);
Kokkos::Profiling::popRegion();

using ConstEdges =
Expand Down
2 changes: 1 addition & 1 deletion src/distributed/detail/ArborX_DistributedTreeUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ void forwardQueriesAndCommunicateResults(
// Merge results
int const n_predicates = predicates.size();
countResults(space, n_predicates, ids, offset);
KokkosExt::sortByKey(space, ids, values);
KokkosExt::sort_by_key(space, ids, values);

Kokkos::Profiling::popRegion();
}
Expand Down
171 changes: 2 additions & 169 deletions src/kokkos_ext/ArborX_KokkosExtSort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,179 +12,12 @@
#ifndef ARBORX_KOKKOS_EXT_SORT_HPP
#define ARBORX_KOKKOS_EXT_SORT_HPP

#include <ArborX_Config.hpp> // ARBORX_ENABLE_ROCTHRUST

#include <kokkos_ext/ArborX_KokkosExtMinMaxReduce.hpp>

#include <Kokkos_Profiling_ScopedRegion.hpp>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Sort.hpp>

// clang-format off
#if defined(KOKKOS_ENABLE_CUDA)
# if defined(KOKKOS_COMPILER_CLANG)

// Older Thrust (or CUB to be more precise) versions use __shfl instead of
// __shfl_sync for clang which was removed in PTX ISA version 6.4, also see
// https://github.com/NVIDIA/cub/pull/170.
#include <cub/version.cuh>
#if defined(CUB_VERSION) && (CUB_VERSION < 101100) && !defined(CUB_USE_COOPERATIVE_GROUPS)
#define CUB_USE_COOPERATIVE_GROUPS
#endif

// Some versions of Clang fail to compile Thrust, failing with errors like
// this:
// <snip>/thrust/system/cuda/detail/core/agent_launcher.h:557:11:
// error: use of undeclared identifier 'va_printf'
// The exact combination of versions for Clang and Thrust (or CUDA) for this
// failure was not investigated, however even very recent version combination
// (Clang 10.0.0 and Cuda 10.0) demonstrated failure.
//
// Defining _CubLog here allows us to avoid that code path, however disabling
// some debugging diagnostics
//
// If _CubLog is already defined, we save it into ARBORX_CubLog_save, and
// restore it at the end
# ifdef _CubLog
# define ARBORX_CubLog_save _CubLog
# endif
# define _CubLog
# include <thrust/device_ptr.h>
# include <thrust/sort.h>
# undef _CubLog
# ifdef ARBORX_CubLog_save
# define _CubLog ARBORX_CubLog_save
# undef ARBORX_CubLog_save
# endif
# else // #if defined(KOKKOS_COMPILER_CLANG)
# include <thrust/device_ptr.h>
# include <thrust/sort.h>
# endif // #if defined(KOKKOS_COMPILER_CLANG)
#endif // #if defined(KOKKOS_ENABLE_CUDA)
// clang-format on

#if defined(KOKKOS_ENABLE_HIP) && defined(ARBORX_ENABLE_ROCTHRUST)
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#endif

#if defined(KOKKOS_ENABLE_SYCL) && defined(ARBORX_ENABLE_ONEDPL)
#include <oneapi/dpl/algorithm>
#include <oneapi/dpl/execution>
#include <oneapi/dpl/iterator>
#endif

namespace ArborX::Details::KokkosExt
{

template <typename ExecutionSpace, typename Keys, typename Values>
void sortByKey(ExecutionSpace const &space, Keys &keys, Values &values)
{
Kokkos::Profiling::ScopedRegion guard("ArborX::KokkosExt::sortByKey::Kokkos");

static_assert(Kokkos::is_view<Keys>::value);
static_assert(Kokkos::is_view<Values>::value);
static_assert(Keys::rank() == 1);
static_assert(Values::rank() == 1);
static_assert(KokkosExt::is_accessible_from<typename Keys::memory_space,
ExecutionSpace>::value);
static_assert(KokkosExt::is_accessible_from<typename Values::memory_space,
ExecutionSpace>::value);
auto const n = keys.size();
ARBORX_ASSERT(values.size() == n);

if (n == 0)
return;

auto [min_val, max_val] = minmax_reduce(space, keys);
if (min_val == max_val)
return;

using SizeType = unsigned int;
using CompType = Kokkos::BinOp1D<Keys>;

Kokkos::BinSort<Keys, CompType, typename Keys::device_type, SizeType>
bin_sort(space, keys, CompType(n / 2, min_val, max_val), true);
bin_sort.create_permute_vector(space);
bin_sort.sort(space, keys);
bin_sort.sort(space, values);
}

#if defined(KOKKOS_ENABLE_CUDA) || \
(defined(KOKKOS_ENABLE_HIP) && defined(ARBORX_ENABLE_ROCTHRUST))
template <typename Keys, typename Values>
void sortByKey(
#if defined(KOKKOS_ENABLE_CUDA)
Kokkos::Cuda const &space,
#else
Kokkos::HIP const &space,
#endif
Keys &keys, Values &values)
{
Kokkos::Profiling::ScopedRegion guard("ArborX::KokkosExt::sortByKey::Thrust");

using ExecutionSpace = std::decay_t<decltype(space)>;
static_assert(Kokkos::is_view<Keys>::value);
static_assert(Kokkos::is_view<Values>::value);
static_assert(Keys::rank() == 1);
static_assert(Values::rank() == 1);
static_assert(KokkosExt::is_accessible_from<typename Keys::memory_space,
ExecutionSpace>::value);
static_assert(KokkosExt::is_accessible_from<typename Values::memory_space,
ExecutionSpace>::value);
auto const n = keys.size();
ARBORX_ASSERT(values.size() == n);

if (n == 0)
return;

#if defined(KOKKOS_ENABLE_CUDA)
auto const execution_policy = thrust::cuda::par.on(space.cuda_stream());
#else
auto const execution_policy = thrust::hip::par.on(space.hip_stream());
#endif

thrust::sort_by_key(execution_policy, keys.data(), keys.data() + n,
values.data());
}
#endif

#if defined(KOKKOS_ENABLE_SYCL) && defined(ARBORX_ENABLE_ONEDPL)
template <typename Keys, typename Values>
void sortByKey(Kokkos::Experimental::SYCL const &space, Keys &keys,
Values &values)
{
Kokkos::Profiling::ScopedRegion guard("ArborX::KokkosExt::sortByKey::OneDPL");

using ExecutionSpace = std::decay_t<decltype(space)>;
static_assert(Kokkos::is_view<Keys>::value);
static_assert(Kokkos::is_view<Values>::value);
static_assert(Keys::rank() == 1);
static_assert(Values::rank() == 1);
static_assert(KokkosExt::is_accessible_from<typename Keys::memory_space,
ExecutionSpace>::value);
static_assert(KokkosExt::is_accessible_from<typename Values::memory_space,
ExecutionSpace>::value);
auto const n = keys.size();
ARBORX_ASSERT(values.size() == n);

if (n == 0)
return;

oneapi::dpl::execution::device_policy policy(
*space.impl_internal_space_instance()->m_queue);
#if ONEDPL_VERSION_MAJOR > 2022 || \
(ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)
oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data());
#else
auto zipped_begin =
oneapi::dpl::make_zip_iterator(keys.data(), values.data());
oneapi::dpl::sort(
policy, zipped_begin, zipped_begin + n,
[](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
#endif
using Kokkos::Experimental::sort_by_key;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a good reason to keep the alias?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. This way, we contain the changes in this PR to a single file. Otherwise, we would have to do the changes all over twice: in this PR, and once sort_by_key moves out of Experimental.

}
#endif

} // namespace ArborX::Details::KokkosExt

#endif
2 changes: 1 addition & 1 deletion src/misc/ArborX_SortUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ auto sortObjects(ExecutionSpace const &space, ViewType &view)
view.extent(0));
KokkosExt::iota(space, permute);

KokkosExt::sortByKey(space, view, permute);
KokkosExt::sort_by_key(space, view, permute);

Kokkos::Profiling::popRegion();

Expand Down
Loading