Skip to content

Commit

Permalink
[SYCL][Fusion] Test and document group algorithms/function support (#…
Browse files Browse the repository at this point in the history
…12644)

Add tests for `group_broadcast` and `permute_group_by_xor` when
remapping is needed.

Document support for group algorithms and functions in the kernel fusion
design document. Kernel fusion supports these constructs, as local
range, group id and local id do not change due to remapping.

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
  • Loading branch information
victor-eds authored Feb 14, 2024
1 parent 7e414a9 commit ca4ed6e
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 5 deletions.
14 changes: 9 additions & 5 deletions sycl/doc/design/KernelFusionJIT.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:

- `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.

Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:

- `global_id`:
- `global_id(0) = GLID / (global_size(1) * global_size(2))`
- `global_id(1) = (GLID / global_size(2)) % global_size(1)`
- `global_id(2) = GLID % global_size(2)`
- `local_id`:
- `local_id(x) = global_id(x) % local_size(x)`
- `group_id`:
- `group_id(x) = global_id(x) / local_size(x)`
- `num_work_groups`:
- `num_work_groups(x) = global_size(x) / local_size(x)`
- `global_size`:
Expand Down Expand Up @@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
});
```
### Group Algorithms and Functions

Kernel fusion supports group algorithms and functions. As per [remapping
rules](#work-item-remapping), group ID and local ID are invariant after fusion
even when different ND-ranges are involved. This way, group functions and
algorithms conceptually executed for a given group and using a given local ID
as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.

### Unsupported SYCL constructs

Expand Down
108 changes: 108 additions & 0 deletions sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

// Test fusion works with permute and remapping.

#include <sycl/sycl.hpp>

#include "../helpers.hpp"
#include "sycl/group_algorithm.hpp"

using namespace sycl;

class FillKernel;
class Kernel0;
class Kernel1;

int main() {
constexpr size_t dataSize = 512;
constexpr size_t localSize = 16;
std::array<int, dataSize / localSize> in;
std::array<int, dataSize> out0;
std::array<int, dataSize / 2> out1;
// Needed to check results
size_t sg_size = 0;

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
{
buffer<int> buff_in{in};
buffer<int> buff_out0{out0};
buffer<int> buff_out1{out1};
buffer<size_t> buff_sg_size{&sg_size, 1};

ext::codeplay::experimental::fusion_wrapper fw{q};

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, write_only, no_init);
cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
[=](nd_item<1> i) {
if (i.get_local_id() == 0) {
auto j = i.get_group(0);
in[j] = static_cast<int>(j);
}
});
});

fw.start_fusion();

q.submit([&](handler &cgh) {
accessor sg_size(buff_sg_size, cgh, write_only, no_init);
accessor in(buff_in, cgh, read_only);
accessor out(buff_out0, cgh, write_only, no_init);
cgh.parallel_for<Kernel0>(
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
sub_group group = i.get_sub_group();
int gid = i.get_global_id();
int sgid = group.get_group_id();
sg_size[0] = group.get_max_local_range()[0];
out[gid] = permute_group_by_xor(
group, gid, sgid % group.get_max_local_range()[0]);
});
});

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out1, cgh, write_only, no_init);
cgh.parallel_for<Kernel1>(
nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
sub_group group = i.get_sub_group();
int gid = i.get_global_id();
int sgid = group.get_group_id();
out[gid] = permute_group_by_xor(
group, gid, sgid % group.get_max_local_range()[0]);
});
});

complete_fusion_with_check(fw);
}

// Check the results
int SGid = 0;
int SGLid = 0;
int SGBeginGid = 0;
int j = 0;
const auto check = [sg_size, &SGid, &SGLid, &SGBeginGid, &out0,
&out1](int j, bool checkSmall) {
if (j % localSize % sg_size == 0) {
SGid++;
SGLid = 0;
SGBeginGid = j;
}
if (j % localSize == 0) {
SGid = 0;
SGLid = 0;
SGBeginGid = j;
}
assert(out0[j] == SGBeginGid + (SGLid ^ (SGid % sg_size)));
assert(!checkSmall || (out1[j] == SGBeginGid + (SGLid ^ (SGid % sg_size))));
SGLid++;
};
for (int end = dataSize / 2; j < end; j++) {
check(j, true);
}
for (int end = dataSize; j < end; j++) {
check(j, false);
}

return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

// Test fusion works with group_broadcast and remapping.

#include <sycl/sycl.hpp>

#include "../helpers.hpp"

using namespace sycl;

class FillKernel;
class Kernel0;
class Kernel1;

int main() {
constexpr size_t dataSize = 512;
constexpr size_t localSize = 16;
std::array<int, dataSize / localSize> in;
std::array<int, dataSize> out0;
std::array<int, dataSize / 2> out1;

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
{
buffer<int> buff_in{in};
buffer<int> buff_out0{out0};
buffer<int> buff_out1{out1};

ext::codeplay::experimental::fusion_wrapper fw{q};

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, write_only, no_init);
cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
[=](nd_item<1> i) {
if (i.get_local_id() == 0) {
auto j = i.get_group(0);
in[j] = static_cast<int>(j);
}
});
});

fw.start_fusion();

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out0, cgh, write_only, no_init);
cgh.parallel_for<Kernel0>(
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
auto group = i.get_group();
out[i.get_global_id()] = group_broadcast(
group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
1);
});
});

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out1, cgh, write_only, no_init);
cgh.parallel_for<Kernel1>(
nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
auto group = i.get_group();
out[i.get_global_id()] = group_broadcast(
group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
1);
});
});

complete_fusion_with_check(fw);
}

// Check the results
int i = 0;
for (int end = dataSize / 2; i < end; ++i) {
int group_id = i / static_cast<int>(localSize);
assert(out0[i] == group_id && "Computation error");
assert(out1[i] == group_id && "Computation error");
}

for (int end = dataSize; i < end; ++i) {
int group_id = i / static_cast<int>(localSize);
assert(out0[i] == group_id && "Computation error");
}

return 0;
}

0 comments on commit ca4ed6e

Please sign in to comment.