[SYCL][Fusion] Test and document group algorithms/function support (#…

…12644) Add tests for `group_broadcast` and `permute_group_by_xor` when remapping is needed. Document support for group algorithms and functions in the kernel fusion design document. Kernel fusion supports these constructs, as local range, group id and local id do not change due to remapping. Signed-off-by: Victor Perez <victor.perez@codeplay.com>
intel · Feb 14, 2024 · ca4ed6e · ca4ed6e
1 parent 7e414a9
commit ca4ed6e
Show file tree

Hide file tree

Showing 3 changed files with 202 additions and 5 deletions.
diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
@@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
 First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:
 
 - `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
-- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.
 
 Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:
 
 - `global_id`:
   - `global_id(0) = GLID / (global_size(1) * global_size(2))`
   - `global_id(1) = (GLID / global_size(2)) % global_size(1)`
   - `global_id(2) = GLID % global_size(2)`
-- `local_id`:
-  - `local_id(x) = global_id(x) % local_size(x)`
-- `group_id`:
-  - `group_id(x) = global_id(x) / local_size(x)`
 - `num_work_groups`:
   - `num_work_groups(x) = global_size(x) / local_size(x)`
 - `global_size`:
@@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
       sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
 });
 ```
+### Group Algorithms and Functions
+
+Kernel fusion supports group algorithms and functions. As per [remapping
+rules](#work-item-remapping), group ID and local ID are invariant after fusion
+even when different ND-ranges are involved. This way, group functions and
+algorithms conceptually executed for a given group and using a given local ID
+as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.
 
 ### Unsupported SYCL constructs
 

diff --git a/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp b/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
@@ -0,0 +1,108 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+// Test fusion works with permute and remapping.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+#include "sycl/group_algorithm.hpp"
+
+using namespace sycl;
+
+class FillKernel;
+class Kernel0;
+class Kernel1;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  constexpr size_t localSize = 16;
+  std::array<int, dataSize / localSize> in;
+  std::array<int, dataSize> out0;
+  std::array<int, dataSize / 2> out1;
+  // Needed to check results
+  size_t sg_size = 0;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  {
+    buffer<int> buff_in{in};
+    buffer<int> buff_out0{out0};
+    buffer<int> buff_out1{out1};
+    buffer<size_t> buff_sg_size{&sg_size, 1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, write_only, no_init);
+      cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
+                                   [=](nd_item<1> i) {
+                                     if (i.get_local_id() == 0) {
+                                       auto j = i.get_group(0);
+                                       in[j] = static_cast<int>(j);
+                                     }
+                                   });
+    });
+
+    fw.start_fusion();
+
+    q.submit([&](handler &cgh) {
+      accessor sg_size(buff_sg_size, cgh, write_only, no_init);
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out0, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel0>(
+          nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
+            sub_group group = i.get_sub_group();
+            int gid = i.get_global_id();
+            int sgid = group.get_group_id();
+            sg_size[0] = group.get_max_local_range()[0];
+            out[gid] = permute_group_by_xor(
+                group, gid, sgid % group.get_max_local_range()[0]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out1, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel1>(
+          nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
+            sub_group group = i.get_sub_group();
+            int gid = i.get_global_id();
+            int sgid = group.get_group_id();
+            out[gid] = permute_group_by_xor(
+                group, gid, sgid % group.get_max_local_range()[0]);
+          });
+    });
+
+    complete_fusion_with_check(fw);
+  }
+
+  // Check the results
+  int SGid = 0;
+  int SGLid = 0;
+  int SGBeginGid = 0;
+  int j = 0;
+  const auto check = [sg_size, &SGid, &SGLid, &SGBeginGid, &out0,
+                      &out1](int j, bool checkSmall) {
+    if (j % localSize % sg_size == 0) {
+      SGid++;
+      SGLid = 0;
+      SGBeginGid = j;
+    }
+    if (j % localSize == 0) {
+      SGid = 0;
+      SGLid = 0;
+      SGBeginGid = j;
+    }
+    assert(out0[j] == SGBeginGid + (SGLid ^ (SGid % sg_size)));
+    assert(!checkSmall || (out1[j] == SGBeginGid + (SGLid ^ (SGid % sg_size))));
+    SGLid++;
+  };
+  for (int end = dataSize / 2; j < end; j++) {
+    check(j, true);
+  }
+  for (int end = dataSize; j < end; j++) {
+    check(j, false);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp
@@ -0,0 +1,85 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+// Test fusion works with group_broadcast and remapping.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+
+using namespace sycl;
+
+class FillKernel;
+class Kernel0;
+class Kernel1;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  constexpr size_t localSize = 16;
+  std::array<int, dataSize / localSize> in;
+  std::array<int, dataSize> out0;
+  std::array<int, dataSize / 2> out1;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  {
+    buffer<int> buff_in{in};
+    buffer<int> buff_out0{out0};
+    buffer<int> buff_out1{out1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, write_only, no_init);
+      cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
+                                   [=](nd_item<1> i) {
+                                     if (i.get_local_id() == 0) {
+                                       auto j = i.get_group(0);
+                                       in[j] = static_cast<int>(j);
+                                     }
+                                   });
+    });
+
+    fw.start_fusion();
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out0, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel0>(
+          nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
+            auto group = i.get_group();
+            out[i.get_global_id()] = group_broadcast(
+                group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
+                1);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out1, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel1>(
+          nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
+            auto group = i.get_group();
+            out[i.get_global_id()] = group_broadcast(
+                group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
+                1);
+          });
+    });
+
+    complete_fusion_with_check(fw);
+  }
+
+  // Check the results
+  int i = 0;
+  for (int end = dataSize / 2; i < end; ++i) {
+    int group_id = i / static_cast<int>(localSize);
+    assert(out0[i] == group_id && "Computation error");
+    assert(out1[i] == group_id && "Computation error");
+  }
+
+  for (int end = dataSize; i < end; ++i) {
+    int group_id = i / static_cast<int>(localSize);
+    assert(out0[i] == group_id && "Computation error");
+  }
+
+  return 0;
+}