
Optimization of __serial_merge function #1970

Merged
merged 18 commits on Dec 18, 2024
Changes from 2 commits
Commits (18)
be021ac
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 17, 2024
d3d863d
Specify __serial_merge by unroll factor template param
SergeyKopienko Dec 17, 2024
6bfe0ee
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
726bd2e
Apply GitHUB clang format
SergeyKopienko Dec 18, 2024
ab54931
Remove unroll from __serial_merge
SergeyKopienko Dec 18, 2024
764a494
Merge branch 'dev/skopienko/serial_merge_without_unroll' into dev/sko…
SergeyKopienko Dec 18, 2024
b30d296
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
288784a
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
2d065ac
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
cafc522
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
bf18ec8
Fix review comment: change __chunk parameter type of __serial_merge t…
SergeyKopienko Dec 18, 2024
879b5d2
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
00421d4
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
3e94b36
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
f82443f
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h - …
SergeyKopienko Dec 18, 2024
10d0dd1
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort…
SergeyKopienko Dec 18, 2024
cbba810
include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort…
SergeyKopienko Dec 18, 2024
a43f427
Merge branch 'main' into dev/skopienko/serial_merge
SergeyKopienko Dec 18, 2024
73 changes: 29 additions & 44 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h
@@ -80,52 +80,35 @@ __find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_el
 
 // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing
 // to rng3 (starting from start3) in 'chunk' steps, but do not exceed the total size of the sequences (n1 and n2)
-template <typename _Rng1, typename _Rng2, typename _Rng3, typename _Index, typename _Compare>
+template <unsigned int _UnrollFactor = 4, typename _Rng1, typename _Rng2, typename _Rng3, typename _Index, typename _Compare>
 void
 __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, _Index __start1, _Index __start2,
                const _Index __start3, const std::uint8_t __chunk, const _Index __n1, const _Index __n2, _Compare __comp)
 {
-    if (__start1 >= __n1)
+    const _Index __rng1_size = std::min<_Index>(__n1 > __start1 ? __n1 - __start1 : (_Index)0, (_Index)__chunk);
+    const _Index __rng2_size = std::min<_Index>(__n2 > __start2 ? __n2 - __start2 : (_Index)0, (_Index)__chunk);
+    const _Index __rng3_size = std::min<_Index>(__rng1_size + __rng2_size, (_Index)__chunk);
+
+    const auto __rng1_idx_end = __start1 + __rng1_size;
+    const auto __rng2_idx_end = __start2 + __rng2_size;
+    const auto __rng3_idx_end = __start3 + __rng3_size;
+
+    _Index __rng1_idx = __start1;
+    _Index __rng2_idx = __start2;
+    _Index __rng3_idx = __start3;
+
+    bool __rng1_idx_less__n1, __rng2_idx_less__n2;
+
+#pragma unroll _UnrollFactor
+    for (_Index __rng3_idx = __start3; __rng3_idx < __rng3_idx_end; ++__rng3_idx)
     {
-        //copying a residual of the second seq
-        const _Index __n = std::min<_Index>(__n2 - __start2, __chunk);
-        for (std::uint8_t __i = 0; __i < __n; ++__i)
-            __rng3[__start3 + __i] = __rng2[__start2 + __i];
-    }
-    else if (__start2 >= __n2)
-    {
-        //copying a residual of the first seq
-        const _Index __n = std::min<_Index>(__n1 - __start1, __chunk);
-        for (std::uint8_t __i = 0; __i < __n; ++__i)
-            __rng3[__start3 + __i] = __rng1[__start1 + __i];
-    }
-    else
-    {
-        for (std::uint8_t __i = 0; __i < __chunk && __start1 < __n1 && __start2 < __n2; ++__i)
-        {
-            const auto& __val1 = __rng1[__start1];
-            const auto& __val2 = __rng2[__start2];
-            if (__comp(__val2, __val1))
-            {
-                __rng3[__start3 + __i] = __val2;
-                if (++__start2 == __n2)
-                {
-                    //copying a residual of the first seq
-                    for (++__i; __i < __chunk && __start1 < __n1; ++__i, ++__start1)
-                        __rng3[__start3 + __i] = __rng1[__start1];
-                }
-            }
-            else
-            {
-                __rng3[__start3 + __i] = __val1;
-                if (++__start1 == __n1)
-                {
-                    //copying a residual of the second seq
-                    for (++__i; __i < __chunk && __start2 < __n2; ++__i, ++__start2)
-                        __rng3[__start3 + __i] = __rng2[__start2];
-                }
-            }
-        }
-    }
+        __rng1_idx_less__n1 = __rng1_idx < __rng1_idx_end;
+        __rng2_idx_less__n2 = __rng2_idx < __rng2_idx_end;
+
+        __rng3[__rng3_idx] =
+            __rng1_idx_less__n1 && __rng2_idx_less__n2
+                ? (__comp(__rng2[__rng2_idx], __rng1[__rng1_idx]) ? __rng2[__rng2_idx++] : __rng1[__rng1_idx++])
+                : (__rng1_idx_less__n1 ? __rng1[__rng1_idx++] : __rng2[__rng2_idx++]);
     }
 }
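
The rewritten body above replaces the branch-heavy merge (separate paths for "only rng2 left", "only rng1 left", and the general case) with a single loop that writes exactly one output element per iteration and selects the source with ternary expressions. Below is a minimal host-side sketch of the same chunked merge step, assuming plain std::vector inputs instead of oneDPL's SYCL range views; serial_merge_chunk and its signature are illustrative only, not part of the library.

    #include <algorithm>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <vector>

    // Merge up to `chunk` output elements starting at out_start, reading from
    // in1 (from start1) and in2 (from start2), mirroring the loop structure of
    // the new __serial_merge: every iteration writes exactly one element.
    template <typename T, typename Compare>
    void
    serial_merge_chunk(const std::vector<T>& in1, const std::vector<T>& in2, std::vector<T>& out,
                       std::size_t start1, std::size_t start2, std::size_t out_start,
                       std::size_t chunk, Compare comp)
    {
        const std::size_t n1 = in1.size(), n2 = in2.size();
        // Clamp how much each input can contribute to this chunk.
        const std::size_t size1 = std::min(n1 > start1 ? n1 - start1 : 0, chunk);
        const std::size_t size2 = std::min(n2 > start2 ? n2 - start2 : 0, chunk);
        const std::size_t size3 = std::min(size1 + size2, chunk);

        std::size_t idx1 = start1, idx2 = start2;
        const std::size_t end1 = start1 + size1, end2 = start2 + size2, end3 = out_start + size3;

        for (std::size_t idx3 = out_start; idx3 < end3; ++idx3)
        {
            const bool take1 = idx1 < end1;
            const bool take2 = idx2 < end2;
            // Compare only while both inputs still have elements in range,
            // otherwise drain whichever side is left.
            out[idx3] = take1 && take2 ? (comp(in2[idx2], in1[idx1]) ? in2[idx2++] : in1[idx1++])
                                       : (take1 ? in1[idx1++] : in2[idx2++]);
        }
    }

    int main()
    {
        const std::vector<int> a{1, 3, 5, 7};
        const std::vector<int> b{2, 4, 6, 8};
        std::vector<int> c(a.size() + b.size());
        // Two chunk-sized calls, as two work-items would execute them.
        serial_merge_chunk(a, b, c, 0, 0, 0, 4, std::less<int>{});
        serial_merge_chunk(a, b, c, 2, 2, 4, 4, std::less<int>{});
        for (int v : c)
            std::cout << v << ' '; // prints: 1 2 3 4 5 6 7 8
        std::cout << '\n';
    }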

@@ -136,6 +119,8 @@ struct __parallel_merge_submitter;
 template <typename _IdType, typename... _Name>
 struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_Name...>>
 {
+    static constexpr std::uint32_t __gpu_chunk = 4;
+
     template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3, typename _Compare>
     auto
     operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const
@@ -149,7 +134,7 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N
         _PRINT_INFO_IN_DEBUG_MODE(__exec);
 
         // Empirical number of values to process per work-item
-        const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4;
+        const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : __gpu_chunk;
 
         const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);
 
@@ -158,8 +143,8 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N
             __cgh.parallel_for<_Name...>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
                 const _IdType __i_elem = __item_id.get_linear_id() * __chunk;
                 const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp);
-                __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2,
-                               __comp);
+                __serial_merge<__gpu_chunk>(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk,
+                                            __n1, __n2, __comp);
             });
         });
         return __future(__event);
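
The submitter above splits the __n output positions into __steps chunk-sized pieces; each work-item computes its global output position __i_elem and calls __find_start_point to learn where its chunk begins in both inputs before running __serial_merge. A host-side sketch of that merge-path split search is shown below, under the assumption of sorted std::vector inputs; find_split is an illustrative stand-in, not oneDPL's __find_start_point.

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // For a global output position i_elem, return (i, j) with i + j == i_elem such that
    // merging a[i..) with b[j..) continues the overall merge correctly. This is a plain
    // binary search along the cross-diagonal of the "merge path".
    template <typename T, typename Compare>
    std::pair<std::size_t, std::size_t>
    find_split(const std::vector<T>& a, const std::vector<T>& b, std::size_t i_elem, Compare comp)
    {
        const std::size_t n1 = a.size(), n2 = b.size();
        std::size_t lo = i_elem > n2 ? i_elem - n2 : 0; // fewest elements a can contribute
        std::size_t hi = std::min(i_elem, n1);          // most elements a can contribute
        while (lo < hi)
        {
            const std::size_t i = lo + (hi - lo) / 2; // candidate: take i elements from a
            const std::size_t j = i_elem - i;         // and j elements from b
            if (j > 0 && i < n1 && !comp(b[j - 1], a[i]))
                lo = i + 1; // a[i] sorts no later than b[j - 1], so more of a belongs before the split
            else
                hi = i;
        }
        return {lo, i_elem - lo};
    }

For a = {1, 3, 5, 7}, b = {2, 4, 6, 8} and i_elem = 4 (the second work-item with a chunk of 4), find_split returns {2, 2}, which is exactly the starting point used by the second serial_merge_chunk call in the earlier sketch.
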
@@ -233,6 +233,8 @@ struct __merge_sort_global_submitter;
 template <typename _IndexT, typename... _GlobalSortName>
 struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name<_GlobalSortName...>>
 {
+    static constexpr std::uint32_t __gpu_chunk = 4;
+
     template <typename _Range, typename _Compare, typename _TempBuf, typename _LeafSizeT>
     std::pair<sycl::event, bool>
     operator()(sycl::queue& __q, _Range& __rng, _Compare __comp, _LeafSizeT __leaf_size, _TempBuf& __temp_buf,
@@ -241,7 +243,7 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
         const _IndexT __n = __rng.size();
         _IndexT __n_sorted = __leaf_size;
         const bool __is_cpu = __q.get_device().is_cpu();
-        const std::uint32_t __chunk = __is_cpu ? 32 : 4;
+        const std::uint32_t __chunk = __is_cpu ? 32 : __gpu_chunk;
         const std::size_t __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk);
         bool __data_in_temp = false;
 
@@ -272,17 +274,17 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name
                         const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1);
 
                         const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp);
-                        __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem,
-                                       __chunk, __n1, __n2, __comp);
+                        __serial_merge<__gpu_chunk>(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second,
+                                                    __i_elem, __chunk, __n1, __n2, __comp);
                     }
                     else
                     {
                         const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset);
                         const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1);
 
                         const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp);
-                        __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem,
-                                       __chunk, __n1, __n2, __comp);
+                        __serial_merge<__gpu_chunk>(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second,
+                                                    __i_elem, __chunk, __n1, __n2, __comp);
                     }
                 });
             });
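
These hunks sit inside the global part of merge sort: each pass merges pairs of already-sorted runs of length __n_sorted and ping-pongs the data between __rng and the temporary buffer, with __data_in_temp recording which side currently holds the valid data. A serial host-side sketch of that pass structure is shown below, assuming the input already consists of sorted runs of leaf_size (the job of the leaf-sort kernel); merge_pass and bottom_up_passes are illustrative names, and std::merge stands in for the per-chunk __find_start_point + __serial_merge work that the kernel spreads across work-items.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // One bottom-up pass: merge adjacent sorted runs of length n_sorted from src into dst.
    template <typename T, typename Compare>
    void
    merge_pass(const std::vector<T>& src, std::vector<T>& dst, std::size_t n_sorted, Compare comp)
    {
        const std::size_t n = src.size();
        for (std::size_t offset = 0; offset < n; offset += 2 * n_sorted)
        {
            const std::size_t mid = std::min(offset + n_sorted, n);
            const std::size_t end = std::min(offset + 2 * n_sorted, n);
            std::merge(src.begin() + offset, src.begin() + mid,
                       src.begin() + mid, src.begin() + end,
                       dst.begin() + offset, comp);
        }
    }

    // Repeated passes double n_sorted and alternate source/destination buffers,
    // playing the role of the __n_sorted loop and the __data_in_temp flag above.
    // leaf_size must be at least 1 and the data pre-sorted in runs of leaf_size.
    template <typename T, typename Compare>
    void
    bottom_up_passes(std::vector<T>& data, std::size_t leaf_size, Compare comp)
    {
        std::vector<T> temp(data.size());
        bool data_in_temp = false;
        for (std::size_t n_sorted = leaf_size; n_sorted < data.size(); n_sorted *= 2)
        {
            if (data_in_temp)
                merge_pass(temp, data, n_sorted, comp);
            else
                merge_pass(data, temp, n_sorted, comp);
            data_in_temp = !data_in_temp;
        }
        if (data_in_temp)
            std::copy(temp.begin(), temp.end(), data.begin());
    }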