From f1eeb9e755aeda5c152552057c4278f4c6834bd5 Mon Sep 17 00:00:00 2001 From: "Fedorov, Andrey" Date: Tue, 31 Oct 2023 07:52:27 -0700 Subject: [PATCH 1/5] add an overload for memory_required in joint_sorter Signed-off-by: Fedorov, Andrey --- .../sycl_ext_oneapi_group_sort.asciidoc | 122 ++++++++++++------ 1 file changed, 83 insertions(+), 39 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index dd1e3acd5107a..5dd97661254b6 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -516,6 +516,9 @@ namespace sycl::ext::oneapi::experimental { template static constexpr size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (3) + template + static constexpr size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (4) }; template - group_sorter(sycl::span scratch, Compare comp = {}); // (4) + group_sorter(sycl::span scratch, Compare comp = {}); // (5) template - T operator()(Group g, T value); // (5) + T operator()(Group g, T value); // (6) template void operator()(Group g, sycl::span values, - Properties properties); // (6) + Properties properties); // (7) static constexpr size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (7) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (8) + static constexpr size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (9) }; template group_key_value_sorter(sycl::span scratch, - Compare comp = {}); // (8) + Compare comp = {}); // (10) template - std::tuple operator()(Group g, T key, U value); // (9) + std::tuple operator()(Group g, T key, U value); // (11) template void operator()(Group g, sycl::span keys, sycl::span values, - Properties property); // (10) + Properties property); // (12) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (11) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (13) + static constexpr std::size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (14) }; } @@ -573,13 +580,13 @@ namespace sycl::ext::oneapi::experimental { template joint_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (12) + std::bitset (std::numeric_limits::max())); // (15) template - void operator()(Group g, Ptr first, Ptr last); // (13) + void operator()(Group g, Ptr first, Ptr last); // (16) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (14) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (17) }; template group_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (15) + std::bitset (std::numeric_limits::max())); // (18) template - T operator()(Group g, T value); // (16) + T operator()(Group g, T value); // (19) template void operator()(Group g, sycl::span values, - Properties properties); // (17) + Properties properties); // (20) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (18) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (21) }; template group_key_value_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (19) + std::bitset (std::numeric_limits::max())); // (22) template - std::tuple operator()(Group g, T key, U value); // (20) + std::tuple operator()(Group g, T key, U value); // (23) template void operator()(Group g, sycl::span keys, sycl::span values, - Properties properties); // (21) + Properties properties); // (24) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (22) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (25) }; } @@ -659,7 +666,7 @@ callers side. Size of required memory (bytes) is defined by calling `memory_required`. |=== -(1), (4), (8) create the object using `comp`. +(1), (5), (10) create the object using `comp`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by `memory_required`, behavior of the corresponding sorting algorithm @@ -674,6 +681,20 @@ _Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. the default sorting algorithm defined by the sorter calling by `joint_sort`. `range_size` represents a range size for sorting, e.g. `last-first` from `operator()` arguments. +It must be called within a SYCL kernel. +Result depends on the `scope` parameter: +use `sycl::memory_scope::work_group` to get memory size required +for each work-group; +use `sycl::memory_scope::sub_group` to get memory size required +for each sub-group. +If other `scope` values are passed, behavior is unspecified. + +(4) Returns size of temporary memory (in bytes) that is required by +the default sorting algorithm defined by the sorter calling by `joint_sort` +depending on `d`. +`range_size` represents a range size for sorting, +e.g. `last-first` from `operator()` arguments. +It mustn't be called within a SYCL kernel, only on host. Result depends on the `scope` parameter: use `sycl::memory_scope::work_group` to get memory size required for each work-group; @@ -681,62 +702,87 @@ use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. If other `scope` values are passed, behavior is unspecified. -(5) Implements a default sorting algorithm to be called by +(6) Implements a default sorting algorithm to be called by the `sort_over_group` algorithm. _Complexity_: Let `N` be the `Group` size. `O(N*log(N)*log(N))` comparisons. -(6) Implements a default sorting algorithm that is called by +(7) Implements a default sorting algorithm that is called by `sort_over_group` and that accepts the `sycl::span` value as an input parameter. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(7) Returns the size of temporary memory (in bytes) that is required by the default +(8) Returns the size of temporary memory (in bytes) that is required by the default sorting algorithm defined by the sorter calling by `sort_over_group`. `ElementsPerWorkItem` is the extent parameter for `sycl::span` that is an input parameter for `sort_over_group`. +It must be called within a SYCL kernel. If `scope == sycl::memory_scope::work_group`, `range_size` is the size of the local range for `sycl::nd_range` that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(9) Implements a default key-value sorting algorithm that is called +(9) Returns the size of temporary memory (in bytes) that is required by the default +sorting algorithm defined by the sorter calling by `sort_over_group` +depending on `d`. +`ElementsPerWorkItem` is the extent parameter for `sycl::span` +that is an input parameter for `sort_over_group`. +It mustn't be called within a SYCL kernel, only on host. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. +If other `scope` values are passed, behavior is unspecified. + +(11) Implements a default key-value sorting algorithm that is called by `sort_key_value_over_group` and that doesn't accept `sycl::span` values as input parameters. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(10) Implements a default key-value sorting algorithm that is called +(12) Implements a default key-value sorting algorithm that is called by `sort_key_value_over_group` and that accepts `sycl::span` values as input parameters. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(11) Returns size of temporary memory (in bytes) that is required by +(13) Returns size of temporary memory (in bytes) that is required by the default key-value sorting algorithm defined by the sorter calling by `sort_key_value_over_group`. +It must be called within a SYCL kernel. If `scope == sycl::memory_scope::work_group`, `range_size` is the size of the local range for `sycl::nd_range` that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(12), (15), (19) create +(14) Returns size of temporary memory (in bytes) that is required by +the default key-value +sorting algorithm defined by the sorter calling by `sort_key_value_over_group` +depending on `d`. +It mustn't be called within a SYCL kernel, only on host. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. +If other `scope` values are passed, behavior is unspecified. + +(15), (18), (22) create the class object to sort values considering only bits that corresponds to 1 in `mask`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by `memory_required`, behavior of the corresponding sorting algorithm is undefined. -(13) Implements the radix sorting algorithm to be called by +(16) Implements the radix sorting algorithm to be called by the `joint_sort` algorithm. -(14) Returns size of temporary memory (in bytes) that is required by +(17) Returns size of temporary memory (in bytes) that is required by the radix sort algorithm calling by `joint_sort`. `range_size` represents a range size for sorting, @@ -748,14 +794,14 @@ use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. If other `scope` values are passed, behavior is unspecified. -(16) Implements the radix sorting algorithm to be called by +(19) Implements the radix sorting algorithm to be called by the `sort_over_group` algorithm. -(17) Implements the radix sorting algorithm that is called by +(20) Implements the radix sorting algorithm that is called by `sort_over_group` and that accepts the `sycl::span` value as an input parameter. -(18) Returns size of temporary memory (in bytes) that is required by the radix +(21) Returns size of temporary memory (in bytes) that is required by the radix sorting algorithm defined by the sorter calling by `sort_over_group`. `ElementsPerWorkItem` is a parameter for `sycl::span` that is an input parameter for `sort_over_group`, where `T` is @@ -766,15 +812,15 @@ that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(20) Implements the radix sorting algorithm that is called +(23) Implements the radix sorting algorithm that is called by `sort_key_value_over_group` and that doesn't accept `sycl::span` values as input parameters. -(21) Implements the radix key-value sorting algorithm that is called +(24) Implements the radix key-value sorting algorithm that is called by `sort_key_value_over_group` and that accepts `sycl::span` values as input parameters. -(22) Returns size of temporary memory (in bytes) that is required by the radix key-value +(25) Returns size of temporary memory (in bytes) that is required by the radix key-value sorting algorithm defined by the sorter calling by `sort_key_value_over_group` with `sycl::span` and `sycl::span` as input parameters. @@ -998,7 +1044,7 @@ namespace my_sycl = sycl::ext::oneapi::experimental; // calculate required local memory size size_t temp_memory_size = my_sycl::default_sorters::joint_sorter<>::memory_required( - sycl::memory_scope::work_group, n); + d, sycl::memory_scope::work_group, n); q.submit([&](sycl::handler& h) { auto acc = sycl::accessor(buf, h); @@ -1075,7 +1121,7 @@ using TupleType = // calculate required local memory size size_t temp_memory_size = my_sycl::default_sorters::joint_sorter<>::memory_required( - sycl::memory_scope::work_group, n); + d, sycl::memory_scope::work_group, n); q.submit([&](sycl::handler& h) { auto keys_acc = sycl::accessor(keys_buf, h); @@ -1185,8 +1231,6 @@ because it's easy to pass different comparator types. . Think about reducing overloads for sorting functions. The thing is that overloads with `Compare` objects seems extra and overloads with sorters, without sorters are enough. -. It would be better if `memory_required` methods had a `sycl::device` parameter -because different devices can require different amount of memory. == Non-implemented features Please, note that following is not inplemented yet for the open-source repo: From ba40850413cf42b981fe847fc4082bb4e6a835c6 Mon Sep 17 00:00:00 2001 From: "Fedorov, Andrey" Date: Thu, 9 Nov 2023 12:07:05 -0800 Subject: [PATCH 2/5] constexpr is extra for default_sorter --- .../proposed/sycl_ext_oneapi_group_sort.asciidoc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index 5dd97661254b6..b71b5066e23e0 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -514,10 +514,10 @@ namespace sycl::ext::oneapi::experimental { void operator()(Group g, Ptr first, Ptr last); // (2) template - static constexpr size_t + static size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (3) template - static constexpr size_t + static size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (4) }; @@ -537,9 +537,9 @@ namespace sycl::ext::oneapi::experimental { sycl::span values, Properties properties); // (7) - static constexpr size_t + static size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (8) - static constexpr size_t + static size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (9) }; @@ -562,9 +562,9 @@ namespace sycl::ext::oneapi::experimental { sycl::span values, Properties property); // (12) - static constexpr std::size_t + static std::size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (13) - static constexpr std::size_t + static std::size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (14) }; } From b9d8057d39ed9da8b66c6a7f41b3e6b7af8a9b20 Mon Sep 17 00:00:00 2001 From: "Fedorov, Andrey" Date: Thu, 9 Nov 2023 12:12:21 -0800 Subject: [PATCH 3/5] added a version --- sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index b71b5066e23e0..424e490fe3824 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -1250,4 +1250,5 @@ Please, note that following is not inplemented yet for the open-source repo: |3|2021-12-16|Andrey Fedorov|Some refactoring, sections reordering, making the entire extension experimental |4|2022-11-14|Andrey Fedorov|Fixed size arrays, key-value sorting and properties +|5|2023-11-09|Andrey Fedorov|Changed `memory_required` functions for default sorters |======================================== From 8a8ba3bfb30f7190b7abd50dc8c78b0d5bd0d9e7 Mon Sep 17 00:00:00 2001 From: Andrei Fedorov Date: Mon, 29 Jan 2024 10:49:04 +0100 Subject: [PATCH 4/5] Add new lines --- .../extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index 424e490fe3824..196c91ad6a682 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -516,6 +516,7 @@ namespace sycl::ext::oneapi::experimental { template static size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (3) + template static size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (4) @@ -539,6 +540,7 @@ namespace sycl::ext::oneapi::experimental { static size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (8) + static size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (9) }; @@ -564,6 +566,7 @@ namespace sycl::ext::oneapi::experimental { static std::size_t memory_required(sycl::memory_scope scope, std::size_t range_size); // (13) + static std::size_t memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (14) }; From 3b2b9ecf8c647e03590923d208f2a5758314b429 Mon Sep 17 00:00:00 2001 From: Andrei Fedorov Date: Mon, 29 Jan 2024 19:35:35 +0100 Subject: [PATCH 5/5] Remove (3) and similar to it form default sorters --- .../sycl_ext_oneapi_group_sort.asciidoc | 115 ++++++------------ 1 file changed, 36 insertions(+), 79 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index 196c91ad6a682..b1e305ec649cd 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -515,11 +515,7 @@ namespace sycl::ext::oneapi::experimental { template static size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (3) - - template - static size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (4) + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (3) }; template - group_sorter(sycl::span scratch, Compare comp = {}); // (5) + group_sorter(sycl::span scratch, Compare comp = {}); // (4) template - T operator()(Group g, T value); // (6) + T operator()(Group g, T value); // (5) template void operator()(Group g, sycl::span values, - Properties properties); // (7) - - static size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (8) + Properties properties); // (6) static size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (9) + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (7) }; template group_key_value_sorter(sycl::span scratch, - Compare comp = {}); // (10) + Compare comp = {}); // (8) template - std::tuple operator()(Group g, T key, U value); // (11) + std::tuple operator()(Group g, T key, U value); // (9) template void operator()(Group g, sycl::span keys, sycl::span values, - Properties property); // (12) + Properties property); // (10) static std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (13) - - static std::size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (14) + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (11) }; } @@ -583,13 +573,13 @@ namespace sycl::ext::oneapi::experimental { template joint_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (15) + std::bitset (std::numeric_limits::max())); // (12) template - void operator()(Group g, Ptr first, Ptr last); // (16) + void operator()(Group g, Ptr first, Ptr last); // (13) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (17) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (14) }; template group_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (18) + std::bitset (std::numeric_limits::max())); // (15) template - T operator()(Group g, T value); // (19) + T operator()(Group g, T value); // (16) template void operator()(Group g, sycl::span values, - Properties properties); // (20) + Properties properties); // (17) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (21) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (18) }; template group_key_value_sorter(sycl::span scratch, const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (22) + std::bitset (std::numeric_limits::max())); // (19) template - std::tuple operator()(Group g, T key, U value); // (23) + std::tuple operator()(Group g, T key, U value); // (20) template void operator()(Group g, sycl::span keys, sycl::span values, - Properties properties); // (24) + Properties properties); // (21) static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (25) + memory_required(sycl::memory_scope scope, std::size_t range_size); // (22) }; } @@ -669,7 +659,7 @@ callers side. Size of required memory (bytes) is defined by calling `memory_required`. |=== -(1), (5), (10) create the object using `comp`. +(1), (4), (8) create the object using `comp`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by `memory_required`, behavior of the corresponding sorting algorithm @@ -681,18 +671,6 @@ the `joint_sort` algorithm. _Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. (3) Returns size of temporary memory (in bytes) that is required by -the default sorting algorithm defined by the sorter calling by `joint_sort`. -`range_size` represents a range size for sorting, -e.g. `last-first` from `operator()` arguments. -It must be called within a SYCL kernel. -Result depends on the `scope` parameter: -use `sycl::memory_scope::work_group` to get memory size required -for each work-group; -use `sycl::memory_scope::sub_group` to get memory size required -for each sub-group. -If other `scope` values are passed, behavior is unspecified. - -(4) Returns size of temporary memory (in bytes) that is required by the default sorting algorithm defined by the sorter calling by `joint_sort` depending on `d`. `range_size` represents a range size for sorting, @@ -705,30 +683,19 @@ use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. If other `scope` values are passed, behavior is unspecified. -(6) Implements a default sorting algorithm to be called by +(5) Implements a default sorting algorithm to be called by the `sort_over_group` algorithm. _Complexity_: Let `N` be the `Group` size. `O(N*log(N)*log(N))` comparisons. -(7) Implements a default sorting algorithm that is called by +(6) Implements a default sorting algorithm that is called by `sort_over_group` and that accepts the `sycl::span` value as an input parameter. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(8) Returns the size of temporary memory (in bytes) that is required by the default -sorting algorithm defined by the sorter calling by `sort_over_group`. -`ElementsPerWorkItem` is the extent parameter for `sycl::span` -that is an input parameter for `sort_over_group`. -It must be called within a SYCL kernel. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -(9) Returns the size of temporary memory (in bytes) that is required by the default +(7) Returns the size of temporary memory (in bytes) that is required by the default sorting algorithm defined by the sorter calling by `sort_over_group` depending on `d`. `ElementsPerWorkItem` is the extent parameter for `sycl::span` @@ -740,31 +707,21 @@ that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(11) Implements a default key-value sorting algorithm that is called +(9) Implements a default key-value sorting algorithm that is called by `sort_key_value_over_group` and that doesn't accept `sycl::span` values as input parameters. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(12) Implements a default key-value sorting algorithm that is called +(10) Implements a default key-value sorting algorithm that is called by `sort_key_value_over_group` and that accepts `sycl::span` values as input parameters. _Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. `O(N*log(N)*log(N))` comparisons. -(13) Returns size of temporary memory (in bytes) that is required by -the default key-value -sorting algorithm defined by the sorter calling by `sort_key_value_over_group`. -It must be called within a SYCL kernel. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -(14) Returns size of temporary memory (in bytes) that is required by +(11) Returns size of temporary memory (in bytes) that is required by the default key-value sorting algorithm defined by the sorter calling by `sort_key_value_over_group` depending on `d`. @@ -775,17 +732,17 @@ that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(15), (18), (22) create +(12), (15), (19) create the class object to sort values considering only bits that corresponds to 1 in `mask`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by `memory_required`, behavior of the corresponding sorting algorithm is undefined. -(16) Implements the radix sorting algorithm to be called by +(13) Implements the radix sorting algorithm to be called by the `joint_sort` algorithm. -(17) Returns size of temporary memory (in bytes) that is required by +(14) Returns size of temporary memory (in bytes) that is required by the radix sort algorithm calling by `joint_sort`. `range_size` represents a range size for sorting, @@ -797,14 +754,14 @@ use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. If other `scope` values are passed, behavior is unspecified. -(19) Implements the radix sorting algorithm to be called by +(16) Implements the radix sorting algorithm to be called by the `sort_over_group` algorithm. -(20) Implements the radix sorting algorithm that is called by +(17) Implements the radix sorting algorithm that is called by `sort_over_group` and that accepts the `sycl::span` value as an input parameter. -(21) Returns size of temporary memory (in bytes) that is required by the radix +(18) Returns size of temporary memory (in bytes) that is required by the radix sorting algorithm defined by the sorter calling by `sort_over_group`. `ElementsPerWorkItem` is a parameter for `sycl::span` that is an input parameter for `sort_over_group`, where `T` is @@ -815,15 +772,15 @@ that was used to run the kernel; if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -(23) Implements the radix sorting algorithm that is called +(20) Implements the radix sorting algorithm that is called by `sort_key_value_over_group` and that doesn't accept `sycl::span` values as input parameters. -(24) Implements the radix key-value sorting algorithm that is called +(21) Implements the radix key-value sorting algorithm that is called by `sort_key_value_over_group` and that accepts `sycl::span` values as input parameters. -(25) Returns size of temporary memory (in bytes) that is required by the radix key-value +(22) Returns size of temporary memory (in bytes) that is required by the radix key-value sorting algorithm defined by the sorter calling by `sort_key_value_over_group` with `sycl::span` and `sycl::span` as input parameters.