diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index cadff26a15d..36860f2d449 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -31,6 +31,8 @@ namespace dpl { namespace __par_backend_hetero { +template +using _split_point_t = std::pair<_Index, _Index>; //Searching for an intersection of a merge matrix (n1, n2) diagonal with the Merge Path to define sub-ranges //to serial merge. For example, a merge matrix for [0,1,1,2,3] and [0,0,2,3] is shown below: @@ -45,37 +47,87 @@ namespace __par_backend_hetero // | ----> // 3 | 0 0 0 0 0 | template -auto -__find_start_point(const _Rng1& __rng1, const _Rng2& __rng2, const _Index __i_elem, const _Index __n1, - const _Index __n2, _Compare __comp) +_split_point_t<_Index> +__find_start_point(const _Rng1& __rng1, const _Index __rng1_from, _Index __rng1_to, const _Rng2& __rng2, + const _Index __rng2_from, _Index __rng2_to, const _Index __i_elem, _Compare __comp) { - //searching for the first '1', a lower bound for a diagonal [0, 0,..., 0, 1, 1,.... 
1, 1] - oneapi::dpl::counting_iterator<_Index> __diag_it(0); + // ----------------------- EXAMPLE ------------------------ + // Let's consider the following input data: + // rng1.size() = 10 + // rng2.size() = 6 + // i_diag = 9 + // Let's define the following ranges for processing: + // rng1: [3, ..., 9) -> __rng1_from = 3, __rng1_to = 9 + // rng2: [1, ..., 4) -> __rng2_from = 1, __rng2_to = 4 + // + // The goal: required to process only X' items of the merge matrix + // as intersection of rng1[3, ..., 9) and rng2[1, ..., 4) + // + // -------------------------------------------------------- + // + // __diag_it_begin(rng1) __diag_it_end(rng1) + // (init state) (dest state) (init state, dest state) + // | | | + // V V V + // + + + + + + + // \ rng1 0 1 2 3 4 5 6 7 8 9 + // rng2 +--------------------------------------+ + // 0 | ^ ^ ^ X | <--- __diag_it_end(rng2) (init state) + // + 1 | <----------------- + + X'2 ^ | <--- __diag_it_end(rng2) (dest state) + // + 2 | <----------------- + X'1 | | + // + 3 | <----------------- X'0 | | <--- __diag_it_begin(rng2) (dest state) + // 4 | X ^ | | + // 5 | X | | | <--- __diag_it_begin(rng2) (init state) + // +-------AX-----------+-----------+-----+ + // AX | | + // AX | | + // Run lower_bound:[from = 5, to = 8) + // + // AX - absent items in rng2 + // + // We have three points on diagonal for call comparison: + // X'0 : call __comp(rng1[5], rng2[3]) // 5 + 3 == 9 - 1 == 8 + // X'1 : call __comp(rng1[6], rng2[2]) // 6 + 2 == 9 - 1 == 8 + // X'2 : call __comp(rng1[7], rng2[1]) // 7 + 1 == 9 - 1 == 8 + // - where for every compared pair idx(rng1) + idx(rng2) == i_diag - 1 - if (__i_elem < __n2) //a condition to specify upper or lower part of the merge matrix to be processed - { - const _Index __q = __i_elem; //diagonal index - const _Index __n_diag = std::min<_Index>(__q, __n1); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __q, __comp](const auto& 
__i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__q - __i_diag - 1], __rng1[__i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(*__res, __q - *__res); - } - else - { - const _Index __q = __i_elem - __n2; //diagonal index - const _Index __n_diag = std::min<_Index>(__n1 - __q, __n2); //diagonal size - auto __res = - std::lower_bound(__diag_it, __diag_it + __n_diag, 1 /*value to find*/, - [&__rng2, &__rng1, __n2, __q, __comp](const auto& __i_diag, const auto& __value) mutable { - const auto __zero_or_one = __comp(__rng2[__n2 - __i_diag - 1], __rng1[__q + __i_diag]); - return __zero_or_one < __value; - }); - return std::make_pair(__q + *__res, __n2 - *__res); - } + using _IndexSigned = std::make_signed_t<_Index>; + + //////////////////////////////////////////////////////////////////////////////////// + // Taking into account the specified constraints of the range of processed data + const _IndexSigned __index_sum = __i_elem - 1; + + _IndexSigned idx1_from = __rng1_from; + _IndexSigned idx1_to = __rng1_to; + + _IndexSigned idx2_from = __index_sum - (__rng1_to - 1); + _IndexSigned idx2_to = __index_sum - __rng1_from + 1; + + const _IndexSigned idx2_from_diff = + idx2_from < (_IndexSigned)__rng2_from ? (_IndexSigned)__rng2_from - idx2_from : 0; + const _IndexSigned idx2_to_diff = idx2_to > (_IndexSigned)__rng2_to ? 
idx2_to - (_IndexSigned)__rng2_to : 0; + + idx1_to -= idx2_from_diff; + idx1_from += idx2_to_diff; + + idx2_from = __index_sum - (idx1_to - 1); + idx2_to = __index_sum - idx1_from + 1; + + //////////////////////////////////////////////////////////////////////////////////// + // Run search of split point on diagonal + + using __it_t = oneapi::dpl::counting_iterator<_Index>; + + __it_t __diag_it_begin(idx1_from); + __it_t __diag_it_end(idx1_to); + + const __it_t __res = + std::lower_bound(__diag_it_begin, __diag_it_end, false, + [&__rng1, &__rng2, __index_sum, __comp](_Index __idx, const bool __value) mutable { + return __value == __comp(__rng2[__index_sum - __idx], __rng1[__idx]); + }); + + return _split_point_t<_Index>{*__res, __index_sum - *__res + 1}; } // Do serial merge of the data from rng1 (starting from start1) and rng2 (starting from start2) and writing @@ -136,22 +188,205 @@ struct __parallel_merge_submitter<_IdType, __internal::__optional_kernel_name<_N const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); - auto __event = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); - __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { - const _IdType __i_elem = __item_id.get_linear_id() * __chunk; - const auto __start = __find_start_point(__rng1, __rng2, __i_elem, __n1, __n2, __comp); - __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, - __comp); + auto __event = __exec.queue().submit( + [&__rng1, &__rng2, &__rng3, __comp, __chunk, __steps, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + __cgh.parallel_for<_Name...>(sycl::range(__steps), [=](sycl::item __item_id) { + const _IdType __i_elem = __item_id.get_linear_id() * __chunk; + const auto __start = + __find_start_point(__rng1, _IdType{0}, __n1, __rng2, _IdType{0}, __n2, 
__i_elem, __comp); + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, __chunk, __n1, __n2, + __comp); + }); }); + // We should return the same thing in the second param of __future for compatibility + // with the returning value in __parallel_merge_submitter_large::operator() + return __future(__event, std::shared_ptr<__result_and_scratch_storage_base>{}); + } +}; + +template +struct __parallel_merge_submitter_large; + +template +struct __parallel_merge_submitter_large<_IdType, _CustomName, + __internal::__optional_kernel_name<_DiagonalsKernelName...>, + __internal::__optional_kernel_name<_MergeKernelName...>> +{ + private: + struct nd_range_params + { + std::size_t base_diag_count = 0; + std::size_t steps_between_two_base_diags = 0; + _IdType chunk = 0; + _IdType steps = 0; + }; + + // Calculate nd-range parameters + template + nd_range_params + eval_nd_range_params(_ExecutionPolicy&& __exec, const _Range1& __rng1, const _Range2& __rng2) const + { + using _Range1ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + using _Range2ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _RangeValueType = std::conditional_t<(sizeof(_Range1ValueType) > sizeof(_Range2ValueType)), + _Range1ValueType, _Range2ValueType>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + + // Empirical number of values to process per work-item + const std::uint8_t __chunk = __exec.queue().get_device().is_cpu() ? 128 : 4; + + const _IdType __steps = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __chunk); + // TODO required to evaluate this value based on available SLM size for each work-group. 
+ const _IdType __base_diag_count = 32 * 1'024; + const _IdType __steps_between_two_base_diags = + oneapi::dpl::__internal::__dpl_ceiling_div(__steps, __base_diag_count); + + return {__base_diag_count, __steps_between_two_base_diags, __chunk, __steps}; + } + + // Calculation of split points on each base diagonal + template + sycl::event + eval_split_points_for_groups(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Compare __comp, + const nd_range_params& __nd_range_params, + _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + const _IdType __n = __n1 + __n2; + + const _IdType __base_diag_chunk = __nd_range_params.steps_between_two_base_diags * __nd_range_params.chunk; + + return __exec.queue().submit([&__rng1, &__rng2, __comp, __nd_range_params, __base_diagonals_sp_global_storage, + __n1, __n2, __n, __base_diag_chunk](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc( + __cgh, __dpl_sycl::__no_init{}); + + __cgh.parallel_for<_DiagonalsKernelName...>( + sycl::range(__nd_range_params.base_diag_count + 1), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + + const _IdType __i_elem = __global_idx * __base_diag_chunk; + + __base_diagonals_sp_global_ptr[__global_idx] = + __i_elem == 0 ? _split_point_t<_IdType>{0, 0} + : (__i_elem < __n ? 
__find_start_point(__rng1, _IdType{0}, __n1, __rng2, + _IdType{0}, __n2, __i_elem, __comp) + : _split_point_t<_IdType>{__n1, __n2}); + }); }); - return __future(__event); + } + + // Process parallel merge + template + sycl::event + run_parallel_merge(const sycl::event& __event, _ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, + _Range3&& __rng3, _Compare __comp, const nd_range_params& __nd_range_params, + const _Storage& __base_diagonals_sp_global_storage) const + { + const _IdType __n1 = __rng1.size(); + const _IdType __n2 = __rng2.size(); + + return __exec.queue().submit([&__event, &__rng1, &__rng2, &__rng3, __comp, __nd_range_params, + __base_diagonals_sp_global_storage, __n1, __n2](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3); + auto __base_diagonals_sp_global_acc = + __base_diagonals_sp_global_storage.template __get_scratch_acc(__cgh); + + __cgh.depends_on(__event); + + __cgh.parallel_for<_MergeKernelName...>( + sycl::range(__nd_range_params.steps), [=](sycl::item __item_id) { + auto __global_idx = __item_id.get_linear_id(); + const _IdType __i_elem = __global_idx * __nd_range_params.chunk; + + auto __base_diagonals_sp_global_ptr = + _Storage::__get_usm_or_buffer_accessor_ptr(__base_diagonals_sp_global_acc); + auto __diagonal_idx = __global_idx / __nd_range_params.steps_between_two_base_diags; + + _split_point_t<_IdType> __start; + if (__global_idx % __nd_range_params.steps_between_two_base_diags != 0) + { + const _split_point_t<_IdType> __sp_left = __base_diagonals_sp_global_ptr[__diagonal_idx]; + const _split_point_t<_IdType> __sp_right = __base_diagonals_sp_global_ptr[__diagonal_idx + 1]; + + __start = __find_start_point(__rng1, __sp_left.first, __sp_right.first, __rng2, + __sp_left.second, __sp_right.second, __i_elem, __comp); + } + else + { + __start = __base_diagonals_sp_global_ptr[__diagonal_idx]; + } + + __serial_merge(__rng1, __rng2, __rng3, __start.first, __start.second, __i_elem, + 
__nd_range_params.chunk, __n1, __n2, __comp); + }); + }); + } + + public: + template + auto + operator()(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp) const + { + assert(__rng1.size() > 0 || __rng2.size() > 0); + + _PRINT_INFO_IN_DEBUG_MODE(__exec); + + // Calculate nd-range parameters + const nd_range_params __nd_range_params = eval_nd_range_params(__exec, __rng1, __rng2); + + // Create storage to save split-points on each base diagonal + 1 (for the right base diagonal in the last work-group) + auto __p_base_diagonals_sp_global_storage = + new __result_and_scratch_storage<_ExecutionPolicy, _split_point_t<_IdType>>( + __exec, 0, __nd_range_params.base_diag_count + 1); + + // Save the raw pointer into a shared_ptr to return it in __future and extend the lifetime of the storage. + std::shared_ptr<__result_and_scratch_storage_base> __p_result_and_scratch_storage_base( + static_cast<__result_and_scratch_storage_base*>(__p_base_diagonals_sp_global_storage)); + + // Find split-points on the base diagonals + sycl::event __event = eval_split_points_for_groups(__exec, __rng1, __rng2, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + // Merge data using split points on each diagonal + __event = run_parallel_merge(__event, __exec, __rng1, __rng2, __rng3, __comp, __nd_range_params, + *__p_base_diagonals_sp_global_storage); + + return __future(std::move(__event), std::move(__p_result_and_scratch_storage_base)); } }; template class __merge_kernel_name; +template +class __merge_kernel_name_large; + +template +class __diagonals_kernel_name; + +template +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() +{ + return 4 * 1'048'576; // 4 MB +} + +template <> +constexpr std::size_t +__get_starting_size_limit_for_large_submitter() +{ + return 16 * 1'048'576; // 16 MB +} + template auto __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __rng1, 
@@ -159,24 +394,44 @@ __parallel_merge(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - const auto __n = __rng1.size() + __rng2.size(); - if (__n <= std::numeric_limits::max()) + using __value_type = oneapi::dpl::__internal::__value_t<_Range3>; + + const std::size_t __n = __rng1.size() + __rng2.size(); + if (__n < __get_starting_size_limit_for_large_submitter<__value_type>()) { using _WiIndex = std::uint32_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + static_assert(__get_starting_size_limit_for_large_submitter<__value_type>() <= + std::numeric_limits<_WiIndex>::max()); + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( + return __parallel_merge_submitter<_WiIndex, _MergeKernelName>()( std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__rng3), __comp); } else { - using _WiIndex = std::uint64_t; - using _MergeKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __merge_kernel_name<_CustomName, _WiIndex>>; - return __parallel_merge_submitter<_WiIndex, _MergeKernel>()( - std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__rng3), __comp); + if (__n <= std::numeric_limits::max()) + { + using _WiIndex = std::uint32_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, 
_DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } + else + { + using _WiIndex = std::uint64_t; + using _DiagonalsKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __diagonals_kernel_name<_CustomName, _WiIndex>>; + using _MergeKernelName = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< + __merge_kernel_name_large<_CustomName, _WiIndex>>; + return __parallel_merge_submitter_large<_WiIndex, _CustomName, _DiagonalsKernelName, _MergeKernelName>()( + std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__rng3), __comp); + } } } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h index a9e60b81c71..70299632223 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge_sort.h @@ -91,8 +91,8 @@ struct __group_merge_path_sorter auto __in_ptr1 = __in_ptr + __start1; auto __in_ptr2 = __in_ptr + __start2; - const std::pair __start = - __find_start_point(__in_ptr1, __in_ptr2, __id_local, __n1, __n2, __comp); + const std::pair __start = __find_start_point( + __in_ptr1, std::uint32_t{0}, __n1, __in_ptr2, std::uint32_t{0}, __n2, __id_local, __comp); // TODO: copy the data into registers before the merge to halve the required amount of SLM __serial_merge(__in_ptr1, __in_ptr2, __out_ptr, __start.first, __start.second, __id, __data_per_workitem, __n1, __n2, __comp); @@ -272,7 +272,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__dst, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__dst, __offset + __n1); - 
const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __rng /*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); } @@ -281,7 +282,8 @@ struct __merge_sort_global_submitter<_IndexT, __internal::__optional_kernel_name const oneapi::dpl::__ranges::drop_view_simple __rng1(__rng, __offset); const oneapi::dpl::__ranges::drop_view_simple __rng2(__rng, __offset + __n1); - const auto start = __find_start_point(__rng1, __rng2, __i_elem_local, __n1, __n2, __comp); + const auto start = __find_start_point(__rng1, _IndexT{0}, __n1, __rng2, _IndexT{0}, __n2, + __i_elem_local, __comp); __serial_merge(__rng1, __rng2, __dst /*__rng3*/, start.first, start.second, __i_elem, __chunk, __n1, __n2, __comp); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index f4eb557170e..a81bda902ba 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -518,8 +518,17 @@ struct __usm_or_buffer_accessor } }; +// This base class is provided to allow same-typed shared pointer return values from kernels in +// a `__future` for keeping alive temporary data, while allowing run-time branches to lead to +// differently typed temporary storage for kernels. Virtual destructor is required to call +// derived class destructor when leaving scope. 
+struct __result_and_scratch_storage_base +{ + virtual ~__result_and_scratch_storage_base() = default; +}; + template -struct __result_and_scratch_storage +struct __result_and_scratch_storage : __result_and_scratch_storage_base { private: using __sycl_buffer_t = sycl::buffer<_T, 1>; diff --git a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp index 34cba9f672a..e41f74005e7 100644 --- a/test/parallel_api/algorithm/alg.merge/merge.pass.cpp +++ b/test/parallel_api/algorithm/alg.merge/merge.pass.cpp @@ -41,19 +41,19 @@ struct test_merge // for reverse iterators template void - operator()(Policy&& exec, ::std::reverse_iterator first1, ::std::reverse_iterator last1, - ::std::reverse_iterator first2, ::std::reverse_iterator last2, - ::std::reverse_iterator out_first, ::std::reverse_iterator out_last) + operator()(Policy&& exec, std::reverse_iterator first1, std::reverse_iterator last1, + std::reverse_iterator first2, std::reverse_iterator last2, + std::reverse_iterator out_first, std::reverse_iterator out_last) { using namespace std; - typedef typename ::std::iterator_traits<::std::reverse_iterator>::value_type T; - const auto res = merge(exec, first1, last1, first2, last2, out_first, ::std::greater()); + typedef typename std::iterator_traits>::value_type T; + const auto res = merge(exec, first1, last1, first2, last2, out_first, std::greater()); EXPECT_TRUE(res == out_last, "wrong return result from merge with predicate"); - EXPECT_TRUE(is_sorted(out_first, res, ::std::greater()), "wrong result from merge with predicate"); - EXPECT_TRUE(includes(out_first, res, first1, last1, ::std::greater()), + EXPECT_TRUE(is_sorted(out_first, res, std::greater()), "wrong result from merge with predicate"); + EXPECT_TRUE(includes(out_first, res, first1, last1, std::greater()), "first sequence is not a part of result"); - EXPECT_TRUE(includes(out_first, res, first2, last2, ::std::greater()), + EXPECT_TRUE(includes(out_first, 
res, first2, last2, std::greater()), "second sequence is not a part of result"); } }; @@ -79,47 +79,41 @@ struct test_merge_compare template void - operator()(Policy&& exec, ::std::reverse_iterator first1, ::std::reverse_iterator last1, - ::std::reverse_iterator first2, ::std::reverse_iterator last2, - ::std::reverse_iterator out_first, ::std::reverse_iterator out_last, + operator()(Policy&& exec, std::reverse_iterator first1, std::reverse_iterator last1, + std::reverse_iterator first2, std::reverse_iterator last2, + std::reverse_iterator out_first, std::reverse_iterator out_last, Compare /* comp */) { using namespace std; - typedef typename ::std::iterator_traits<::std::reverse_iterator>::value_type T; - const auto res = merge(exec, first1, last1, first2, last2, out_first, ::std::greater()); + typedef typename std::iterator_traits>::value_type T; + const auto res = merge(exec, first1, last1, first2, last2, out_first, std::greater()); EXPECT_TRUE(res == out_last, "wrong return result from merge with predicate"); - EXPECT_TRUE(is_sorted(out_first, res, ::std::greater()), "wrong result from merge with predicate"); - EXPECT_TRUE(includes(out_first, res, first1, last1, ::std::greater()), + EXPECT_TRUE(is_sorted(out_first, res, std::greater()), "wrong result from merge with predicate"); + EXPECT_TRUE(includes(out_first, res, first1, last1, std::greater()), "first sequence is not a part of result"); - EXPECT_TRUE(includes(out_first, res, first2, last2, ::std::greater()), + EXPECT_TRUE(includes(out_first, res, first2, last2, std::greater()), "second sequence is not a part of result"); } }; -template +template void -test_merge_by_type(Generator1 generator1, Generator2 generator2) +test_merge_by_type(Generator1 generator1, Generator2 generator2, size_t start_size, size_t max_size, FStep fstep) { using namespace std; - size_t max_size = 100000; Sequence in1(max_size, generator1); Sequence in2(max_size / 2, generator2); Sequence out(in1.size() + in2.size()); - 
::std::sort(in1.begin(), in1.end()); - ::std::sort(in2.begin(), in2.end()); + std::sort(in1.begin(), in1.end()); + std::sort(in2.begin(), in2.end()); - size_t start_size = 0; -#if TEST_DPCPP_BACKEND_PRESENT - start_size = 2; -#endif - - for (size_t size = start_size; size <= max_size; size = size <= 16 ? size + 1 : size_t(3.1415 * size)) { + for (size_t size = start_size; size <= max_size; size = fstep(size)) { #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<0>()(test_merge(), in1.cbegin(), in1.cbegin() + size, in2.data(), in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size); invoke_on_all_policies<1>()(test_merge_compare(), in1.cbegin(), in1.cbegin() + size, in2.data(), - in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size, ::std::less()); + in2.data() + size / 2, out.begin(), out.begin() + 1.5 * size, std::less()); #endif // Currently test harness doesn't execute the testcase for inputs with more than 1000 elements for const iterators to optimize execution time, @@ -128,17 +122,34 @@ test_merge_by_type(Generator1 generator1, Generator2 generator2) invoke_on_all_policies<2>()(test_merge(), in1.begin(), in1.begin() + size, in2.cbegin(), in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size); invoke_on_all_policies<3>()(test_merge_compare(), in1.begin(), in1.begin() + size, in2.cbegin(), - in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size, ::std::less()); + in2.cbegin() + size / 2, out.begin(), out.begin() + 1.5 * size, std::less()); #if !TEST_DPCPP_BACKEND_PRESENT invoke_on_all_policies<4>()(test_merge(), in1.data(), in1.data() + size, in2.cbegin(), in2.cbegin() + size / 2, out.begin(), out.begin() + 3 * size / 2); invoke_on_all_policies<5>()(test_merge_compare(), in1.data(), in1.data() + size, in2.cbegin(), - in2.cbegin() + size / 2, out.begin(), out.begin() + 3 * size / 2, ::std::less()); + in2.cbegin() + size / 2, out.begin(), out.begin() + 3 * size / 2, std::less()); #endif } } +template +void 
+test_merge_by_type(size_t start_size, size_t max_size, FStep fstep) +{ + test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }, start_size, max_size, fstep); +#if !ONEDPL_FPGA_DEVICE + test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }, start_size, max_size, fstep); +#endif + +#if !TEST_DPCPP_BACKEND_PRESENT + // Wrapper has atomic increment in ctor. It's not allowed in kernel + test_merge_by_type>([](size_t v) { return Wrapper(v % 100); }, + [](size_t v) { return Wrapper(v % 10); }, + start_size, max_size, fstep); +#endif +} + template struct test_non_const { @@ -146,7 +157,7 @@ struct test_non_const void operator()(Policy&& exec, InputIterator input_iter, OutputIterator out_iter) { - merge(exec, input_iter, input_iter, input_iter, input_iter, out_iter, non_const(::std::less())); + merge(exec, input_iter, input_iter, input_iter, input_iter, out_iter, non_const(std::less())); } }; @@ -166,18 +177,31 @@ struct test_merge_tuple int main() { - test_merge_by_type([](size_t v) { return (v % 2 == 0 ? v : -v) * 3; }, [](size_t v) { return v * 2; }); -#if !ONEDPL_FPGA_DEVICE - test_merge_by_type([](size_t v) { return float64_t(v); }, [](size_t v) { return float64_t(v - 100); }); +#if TEST_DPCPP_BACKEND_PRESENT + const size_t start_size_small = 2; +#else + const size_t start_size_small = 0; +#endif + const size_t max_size_small = 100000; + auto fstep_small = [](std::size_t size){ return size <= 16 ? 
size + 1 : size_t(3.1415 * size);}; + test_merge_by_type(start_size_small, max_size_small, fstep_small); + + // Large data sizes (on GPU only) +#if TEST_DPCPP_BACKEND_PRESENT + if (!TestUtils::get_test_queue().get_device().is_cpu()) + { + const size_t start_size_large = 4'000'000; + const size_t max_size_large = 8'000'000; + auto fstep_large = [](std::size_t size){ return size + 2'000'000; }; + test_merge_by_type(start_size_large, max_size_large, fstep_large); + } #endif #if !TEST_DPCPP_BACKEND_PRESENT - // Wrapper has atomic increment in ctor. It's not allowed in kernel - test_merge_by_type>([](size_t v) { return Wrapper(v % 100); }, - [](size_t v) { return Wrapper(v % 10); }); test_algo_basic_double(run_for_rnd_fw>()); #endif + using T = std::tuple; //a pair (key, value) std::vector a = { {1, 2}, {1, 2}, {1,2}, {1,2}, {1, 2}, {1, 2} }; std::vector b = { {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1} };