intel · sommerlukas · Feb 14, 2024 · Feb 7, 2024
@@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
 First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:
 
 - `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
-- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.
 
 Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:
 
 - `global_id`:
   - `global_id(0) = GLID / (global_size(1) * global_size(2))`
   - `global_id(1) = (GLID / global_size(2)) % global_size(1)`
   - `global_id(2) = GLID % global_size(2)`
-- `local_id`:
-  - `local_id(x) = global_id(x) % local_size(x)`
-- `group_id`:
-  - `group_id(x) = global_id(x) / local_size(x)`
 - `num_work_groups`:
   - `num_work_groups(x) = global_size(x) / local_size(x)`
 - `global_size`:
@@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
       sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
 });
 ```
+### Group Algorithms and Functions
+
+Kernel fusion supports group algorithms and functions. As per [remapping
+rules](#work-item-remapping), group ID and local ID are invariant after fusion
+even when different ND-ranges are involved. This way, group functions and
+algorithms conceptually executed for a given group and using a given local ID
+as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.
 
 ### Unsupported SYCL constructs
 

@@ -0,0 +1,108 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+// Test fusion works with permute and remapping.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+#include "sycl/group_algorithm.hpp"
+
+using namespace sycl;
+
+class FillKernel;
+class Kernel0;
+class Kernel1;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  constexpr size_t localSize = 16;
+  std::array<int, dataSize / localSize> in;
+  std::array<int, dataSize> out0;
+  std::array<int, dataSize / 2> out1;
+  // Needed to check results
+  size_t sg_size = 0;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  {
+    buffer<int> buff_in{in};
+    buffer<int> buff_out0{out0};
+    buffer<int> buff_out1{out1};
+    buffer<size_t> buff_sg_size{&sg_size, 1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, write_only, no_init);
+      cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
+                                   [=](nd_item<1> i) {
+                                     if (i.get_local_id() == 0) {
+                                       auto j = i.get_group(0);
+                                       in[j] = static_cast<int>(j);
+                                     }
+                                   });
+    });
+
+    fw.start_fusion();
+
+    q.submit([&](handler &cgh) {
+      accessor sg_size(buff_sg_size, cgh, write_only, no_init);
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out0, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel0>(
+          nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
+            sub_group group = i.get_sub_group();
+            int gid = i.get_global_id();
+            int sgid = group.get_group_id();
+            sg_size[0] = group.get_max_local_range()[0];
+            out[gid] = permute_group_by_xor(
+                group, gid, sgid % group.get_max_local_range()[0]);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out1, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel1>(
+          nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
+            sub_group group = i.get_sub_group();
+            int gid = i.get_global_id();
+            int sgid = group.get_group_id();
+            out[gid] = permute_group_by_xor(
+                group, gid, sgid % group.get_max_local_range()[0]);
+          });
+    });
+
+    complete_fusion_with_check(fw);
+  }
+
+  // Check the results
+  int SGid = 0;
+  int SGLid = 0;
+  int SGBeginGid = 0;
+  int j = 0;
+  const auto check = [sg_size, &SGid, &SGLid, &SGBeginGid, &out0,
+                      &out1](int j, bool checkSmall) {
+    if (j % localSize % sg_size == 0) {
+      SGid++;
+      SGLid = 0;
+      SGBeginGid = j;
+    }
+    if (j % localSize == 0) {
+      SGid = 0;
+      SGLid = 0;
+      SGBeginGid = j;
+    }
+    assert(out0[j] == SGBeginGid + (SGLid ^ (SGid % sg_size)));
+    assert(!checkSmall || (out1[j] == SGBeginGid + (SGLid ^ (SGid % sg_size))));
+    SGLid++;
+  };
+  for (int end = dataSize / 2; j < end; j++) {
+    check(j, true);
+  }
+  for (int end = dataSize; j < end; j++) {
+    check(j, false);
+  }
+
+  return 0;
+}
@@ -0,0 +1,85 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+// Test fusion works with group_broadcast and remapping.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+
+using namespace sycl;
+
+class FillKernel;
+class Kernel0;
+class Kernel1;
+
+int main() {
+  constexpr size_t dataSize = 512;
+  constexpr size_t localSize = 16;
+  std::array<int, dataSize / localSize> in;
+  std::array<int, dataSize> out0;
+  std::array<int, dataSize / 2> out1;
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  {
+    buffer<int> buff_in{in};
+    buffer<int> buff_out0{out0};
+    buffer<int> buff_out1{out1};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, write_only, no_init);
+      cgh.parallel_for<FillKernel>(nd_range<1>{{dataSize}, {localSize}},
+                                   [=](nd_item<1> i) {
+                                     if (i.get_local_id() == 0) {
+                                       auto j = i.get_group(0);
+                                       in[j] = static_cast<int>(j);
+                                     }
+                                   });
+    });
+
+    fw.start_fusion();
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out0, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel0>(
+          nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
+            auto group = i.get_group();
+            out[i.get_global_id()] = group_broadcast(
+                group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
+                1);
+          });
+    });
+
+    q.submit([&](handler &cgh) {
+      accessor in(buff_in, cgh, read_only);
+      accessor out(buff_out1, cgh, write_only, no_init);
+      cgh.parallel_for<Kernel1>(
+          nd_range<1>{{dataSize / 2}, {localSize}}, [=](nd_item<1> i) {
+            auto group = i.get_group();
+            out[i.get_global_id()] = group_broadcast(
+                group, i.get_local_id() == 1 ? in[group.get_group_id(0)] : -1,
+                1);
+          });
+    });
+
+    complete_fusion_with_check(fw);
+  }
+
+  // Check the results
+  int i = 0;
+  for (int end = dataSize / 2; i < end; ++i) {
+    int group_id = i / static_cast<int>(localSize);
+    assert(out0[i] == group_id && "Computation error");
+    assert(out1[i] == group_id && "Computation error");
+  }
+
+  for (int end = dataSize; i < end; ++i) {
+    int group_id = i / static_cast<int>(localSize);
+    assert(out0[i] == group_id && "Computation error");
+  }
+
+  return 0;
+}