Skip to content

Commit

Permalink
Merge branch 'main' into dev/skopienko/optimize_merge_sort_V1
Browse files Browse the repository at this point in the history
  • Loading branch information
SergeyKopienko committed Dec 23, 2024
2 parents 14ec793 + c2a226b commit befb909
Show file tree
Hide file tree
Showing 11 changed files with 208 additions and 121 deletions.
8 changes: 4 additions & 4 deletions include/oneapi/dpl/internal/scan_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,11 +164,11 @@ struct __sycl_scan_by_segment_impl

__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_scan_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_scan_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -268,11 +268,11 @@ struct __sycl_scan_by_segment_impl
__dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh);

__dpl_sycl::__local_accessor<bool> __loc_seg_ends_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_scan_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegScanPrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_scan_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
8 changes: 4 additions & 4 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
#endif
__cgh.parallel_for<_LocalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel_1,
#endif
sycl::nd_range<1>(__n_groups * __wgroup_size, __wgroup_size), [=](sycl::nd_item<1> __item) {
Expand All @@ -347,11 +347,11 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
__cgh.depends_on(__submit_event);
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
#endif
__cgh.parallel_for<_GroupScanKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel_2,
#endif
// TODO: try to balance work between several workgroups instead of one
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,11 @@ __radix_sort_count_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments, :
oneapi::dpl::__ranges::__require_access(__hdl, __val_rng, __count_rng);
// an accessor per work-group with value counters from each work-item
auto __count_lacc = __dpl_sycl::__local_accessor<_CountT>(__wg_size * __radix_states, __hdl);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__segments * __wg_size, __wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -299,11 +299,11 @@ __radix_sort_scan_submit(_ExecutionPolicy&& __exec, ::std::size_t __scan_wg_size
__hdl.depends_on(__dependency_event);
// access the counters for all work groups
oneapi::dpl::__ranges::__require_access(__hdl, __count_rng);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(__radix_states * __scan_wg_size, __scan_wg_size), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -346,7 +346,7 @@ enum class __peer_prefix_algo
template <std::uint32_t __radix_states, typename _OffsetT, __peer_prefix_algo _Algo>
struct __peer_prefix_helper;

#if (_ONEDPL_LIBSYCL_VERSION >= 50700)
#if _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT
template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic_fetch_or>
{
Expand Down Expand Up @@ -390,7 +390,7 @@ struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::atomic
return __offset;
}
};
#endif // (_ONEDPL_LIBSYCL_VERSION >= 50700)
#endif // _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT

template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::scan_then_broadcast>
Expand Down Expand Up @@ -428,7 +428,7 @@ struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::scan_t
}
};

#if _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT
#if _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT
template <std::uint32_t __radix_states, typename _OffsetT>
struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::subgroup_ballot>
{
Expand Down Expand Up @@ -468,7 +468,7 @@ struct __peer_prefix_helper<__radix_states, _OffsetT, __peer_prefix_algo::subgro
return __offset;
}
};
#endif // _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT
#endif // _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT

template <typename _InRange, typename _OutRange>
void
Expand Down Expand Up @@ -544,11 +544,11 @@ __radix_sort_reorder_submit(_ExecutionPolicy&& __exec, ::std::size_t __segments,

typename _PeerHelper::_TempStorageT __peer_temp(1, __hdl);

#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__hdl.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__hdl.parallel_for<_KernelName>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
//Each SYCL work group processes one data segment.
Expand Down Expand Up @@ -726,13 +726,13 @@ struct __parallel_radix_sort_iteration
sycl::event __reorder_event{};
if (__reorder_sg_size == 8 || __reorder_sg_size == 16 || __reorder_sg_size == 32)
{
#if _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT
#if _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT
constexpr auto __peer_algorithm = __peer_prefix_algo::subgroup_ballot;
#elif _ONEDPL_LIBSYCL_VERSION >= 50700
#elif _ONEDPL_SYCL2020_SUBGROUP_BARRIER_PRESENT
constexpr auto __peer_algorithm = __peer_prefix_algo::atomic_fetch_or;
#else
constexpr auto __peer_algorithm = __peer_prefix_algo::scan_then_broadcast;
#endif // _ONEDPL_SYCL_SUB_GROUP_MASK_PRESENT
#endif // _ONEDPL_LIBSYCL_SUB_GROUP_MASK_PRESENT

__reorder_event =
__radix_sort_reorder_submit<_RadixReorderPeerKernel, __radix_bits, __is_ascending, __peer_algorithm>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,11 @@ struct __parallel_transform_reduce_impl
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_ReduceKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__kernel,
#endif
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,12 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __keys);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceCountKernel>(
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_reduce_count_kernel,
#endif
sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -202,11 +202,11 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
__cgh.depends_on(__seg_end_identification);
auto __seg_ends_acc = __seg_ends.template get_access<sycl::access_mode::read>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read_write>(__cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceOffsetKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_reduce_offset_kernel,
#endif
sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand All @@ -225,11 +225,11 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
auto __partials_acc = __partials.template get_access<sycl::access_mode::read_write>(__cgh);
auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access<sycl::access_mode::read>(__cgh);
__dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReduceWgKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_reduce_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down Expand Up @@ -348,11 +348,11 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
__dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh);

__cgh.depends_on(__wg_reduce);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle());
#endif
__cgh.parallel_for<_SegReducePrefixKernel>(
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
__seg_reduce_prefix_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
Expand Down
23 changes: 10 additions & 13 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,12 @@ __supports_sub_group_size(const _ExecutionPolicy& __exec, std::size_t __target_s
// Kernel run-time information helpers
//-----------------------------------------------------------------------------

// 20201214 value corresponds to Intel(R) oneAPI C++ Compiler Classic 2021.1.2 Patch release
#define _USE_KERNEL_DEVICE_SPECIFIC_API (__SYCL_COMPILER_VERSION > 20201214) || (_ONEDPL_LIBSYCL_VERSION >= 50700)

template <typename _ExecutionPolicy>
::std::size_t
__kernel_work_group_size(const _ExecutionPolicy& __policy, const sycl::kernel& __kernel)
{
const sycl::device& __device = __policy.queue().get_device();
#if _USE_KERNEL_DEVICE_SPECIFIC_API
#if _ONEDPL_SYCL2020_KERNEL_DEVICE_API_PRESENT
return __kernel.template get_info<sycl::info::kernel_device_specific::work_group_size>(__device);
#else
return __kernel.template get_work_group_info<sycl::info::kernel_work_group::work_group_size>(__device);
Expand All @@ -130,10 +127,10 @@ __kernel_sub_group_size(const _ExecutionPolicy& __policy, const sycl::kernel& __
const sycl::device& __device = __policy.queue().get_device();
[[maybe_unused]] const ::std::size_t __wg_size = __kernel_work_group_size(__policy, __kernel);
const ::std::uint32_t __sg_size =
#if _USE_KERNEL_DEVICE_SPECIFIC_API
#if _ONEDPL_SYCL2020_KERNEL_DEVICE_API_PRESENT
__kernel.template get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
__device
# if _ONEDPL_LIBSYCL_VERSION < 60000
# if _ONEDPL_LIBSYCL_VERSION_LESS_THAN(60000)
,
sycl::range<3> { __wg_size, 1, 1 }
# endif
Expand Down Expand Up @@ -267,7 +264,7 @@ class __kernel_compiler
static_assert(__kernel_count > 0, "At least one kernel name should be provided");

public:
#if _ONEDPL_KERNEL_BUNDLE_PRESENT
#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
template <typename _Exec>
static auto
__compile(_Exec&& __exec)
Expand Down Expand Up @@ -551,13 +548,13 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
inline bool
__use_USM_host_allocations(sycl::queue __queue)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if _ONEDPL_SYCL2020_DEFAULT_ACCESSOR_CONSTRUCTOR_PRESENT && _ONEDPL_SYCL_L0_EXT_PRESENT
auto __device = __queue.get_device();
if (!__device.is_gpu())
return false;
if (!__device.has(sycl::aspect::usm_host_allocations))
return false;
if (__device.get_backend() != sycl::backend::ext_oneapi_level_zero)
if (__device.get_backend() != __dpl_sycl::__level_zero_backend)
return false;
return true;
#else
Expand All @@ -568,7 +565,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
inline bool
__use_USM_allocations(sycl::queue __queue)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if _ONEDPL_SYCL2020_DEFAULT_ACCESSOR_CONSTRUCTOR_PRESENT
return __queue.get_device().has(sycl::aspect::usm_device_allocations);
#else
return false;
Expand Down Expand Up @@ -621,7 +618,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
static auto
__get_usm_or_buffer_accessor_ptr(const _Acc& __acc, std::size_t __scratch_n = 0)
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if _ONEDPL_SYCL2020_DEFAULT_ACCESSOR_CONSTRUCTOR_PRESENT
return __acc.__get_pointer();
#else
return &__acc[__scratch_n];
Expand All @@ -632,7 +629,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
auto
__get_result_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if _ONEDPL_SYCL2020_DEFAULT_ACCESSOR_CONSTRUCTOR_PRESENT
if (__use_USM_host && __supports_USM_device)
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __result_buf.get(), __prop_list);
else if (__supports_USM_device)
Expand All @@ -648,7 +645,7 @@ struct __result_and_scratch_storage : __result_and_scratch_storage_base
auto
__get_scratch_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
#if _ONEDPL_SYCL2020_DEFAULT_ACCESSOR_CONSTRUCTOR_PRESENT
if (__use_USM_host || __supports_USM_device)
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __prop_list);
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __prop_list);
Expand Down
Loading

0 comments on commit befb909

Please sign in to comment.