Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Fusion] Test and document group algorithms/function support #12644

Merged
merged 1 commit into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions sycl/doc/design/KernelFusionJIT.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:

- `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.

Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:

- `global_id`:
- `global_id(0) = GLID / (global_size(1) * global_size(2))`
- `global_id(1) = (GLID / global_size(2)) % global_size(1)`
- `global_id(2) = GLID % global_size(2)`
victor-eds marked this conversation as resolved.
Show resolved Hide resolved
- `local_id`:
- `local_id(x) = global_id(x) % local_size(x)`
- `group_id`:
- `group_id(x) = global_id(x) / local_size(x)`
victor-eds marked this conversation as resolved.
Show resolved Hide resolved
- `num_work_groups`:
- `num_work_groups(x) = global_size(x) / local_size(x)`
- `global_size`:
Expand Down Expand Up @@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
});
```
### Group Algorithms and Functions

Kernel fusion supports group algorithms and functions. As per [remapping
rules](#work-item-remapping), group ID and local ID are invariant after fusion
even when different ND-ranges are involved. This way, group functions and
algorithms conceptually executed for a given group and using a given local ID
as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.

### Unsupported SYCL constructs

Expand Down
108 changes: 108 additions & 0 deletions sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

// Test fusion works with permute and remapping.

#include <sycl/sycl.hpp>

#include "../helpers.hpp"
#include "sycl/group_algorithm.hpp"

using namespace sycl;

class FillKernel;
class Kernel0;
class Kernel1;

int main() {
constexpr size_t dataSize = 512;
constexpr size_t localSize = 16;
std::array<int, dataSize / localSize> in;
std::array<int, dataSize> out0;
std::array<int, dataSize / 2> out1;
// Needed to check results
size_t sg_size = 0;

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
{
buffer<int> buff_in{in};
buffer<int> buff_out0{out0};
buffer<int> buff_out1{out1};
buffer<size_t> buff_sg_size{&sg_size, 1};

ext::codeplay::experimental::fusion_wrapper fw{q};

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, write_only, no_init);
cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
[=](nd_item<1> i) {
if (i.get_local_id() == 0) {
auto j = i.get_group(0);
in[j] = static_cast<int>(j);
}
});
});

fw.start_fusion();

q.submit([&](handler &cgh) {
accessor sg_size(buff_sg_size, cgh, write_only, no_init);
accessor in(buff_in, cgh, read_only);
accessor out(buff_out0, cgh, write_only, no_init);
cgh.parallel_for<Kernel0>(
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
sub_group group = i.get_sub_group();
int gid = i.get_global_id();
int sgid = group.get_group_id();
sg_size[0] = group.get_max_local_range()[0];
out[gid] = permute_group_by_xor(
group, gid, sgid % group.get_max_local_range()[0]);
});
});

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out1, cgh, write_only, no_init);
cgh.parallel_for<Kernel1>(
nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
sub_group group = i.get_sub_group();
int gid = i.get_global_id();
int sgid = group.get_group_id();
out[gid] = permute_group_by_xor(
group, gid, sgid % group.get_max_local_range()[0]);
});
});

complete_fusion_with_check(fw);
}

// Check the results
int SGid = 0;
int SGLid = 0;
int SGBeginGid = 0;
int j = 0;
const auto check = [sg_size, &SGid, &SGLid, &SGBeginGid, &out0,
&out1](int j, bool checkSmall) {
if (j % localSize % sg_size == 0) {
SGid++;
SGLid = 0;
SGBeginGid = j;
}
if (j % localSize == 0) {
SGid = 0;
SGLid = 0;
SGBeginGid = j;
}
assert(out0[j] == SGBeginGid + (SGLid ^ (SGid % sg_size)));
assert(!checkSmall || (out1[j] == SGBeginGid + (SGLid ^ (SGid % sg_size))));
SGLid++;
};
for (int end = dataSize / 2; j < end; j++) {
check(j, true);
}
for (int end = dataSize; j < end; j++) {
check(j, false);
}

return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// RUN: %{build} -fsycl-embed-ir -o %t.out
// RUN: %{run} %t.out

// Test fusion works with group_broadcast and remapping.

#include <sycl/sycl.hpp>

#include "../helpers.hpp"

using namespace sycl;

class FillKernel;
class Kernel0;
class Kernel1;

int main() {
constexpr size_t dataSize = 512;
constexpr size_t localSize = 16;
std::array<int, dataSize / localSize> in;
std::array<int, dataSize> out0;
std::array<int, dataSize / 2> out1;

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
{
buffer<int> buff_in{in};
buffer<int> buff_out0{out0};
buffer<int> buff_out1{out1};

ext::codeplay::experimental::fusion_wrapper fw{q};

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, write_only, no_init);
cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
[=](nd_item<1> i) {
if (i.get_local_id() == 0) {
auto j = i.get_group(0);
in[j] = static_cast<int>(j);
}
});
});

fw.start_fusion();

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out0, cgh, write_only, no_init);
cgh.parallel_for<Kernel0>(
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
auto group = i.get_group();
out[i.get_global_id()] = group_broadcast(
group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
1);
});
});

q.submit([&](handler &cgh) {
accessor in(buff_in, cgh, read_only);
accessor out(buff_out1, cgh, write_only, no_init);
cgh.parallel_for<Kernel1>(
nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
auto group = i.get_group();
out[i.get_global_id()] = group_broadcast(
group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
1);
});
});

complete_fusion_with_check(fw);
}

// Check the results
int i = 0;
for (int end = dataSize / 2; i < end; ++i) {
int group_id = i / static_cast<int>(localSize);
assert(out0[i] == group_id && "Computation error");
assert(out1[i] == group_id && "Computation error");
}

for (int end = dataSize; i < end; ++i) {
int group_id = i / static_cast<int>(localSize);
assert(out0[i] == group_id && "Computation error");
}

return 0;
}
Loading