diff --git a/sycl-fusion/common/lib/NDRangesHelper.cpp b/sycl-fusion/common/lib/NDRangesHelper.cpp index 96f26d96a4ea5..7c418022ef4ee 100644 --- a/sycl-fusion/common/lib/NDRangesHelper.cpp +++ b/sycl-fusion/common/lib/NDRangesHelper.cpp @@ -170,6 +170,18 @@ jit_compiler::FusedNDRange::get(ArrayRef NDRanges) { "Cannot fuse kernels whose fusion would " "yield non-uniform work-group sizes"); } + + // Work-items in the same work-group in the original ND-ranges must be in + // the same work-group in the fused one. + if (LocalSize && any_of(NDRanges, [&Fused](const NDRange &NDR) { + return NDR.hasSpecificLocalSize() && requireIDRemapping(Fused, NDR); + })) { + return createStringError( + inconvertibleErrorCode(), + "Cannot fuse kernels when any of the fused kernels with a specific " + "local size has different global sizes in dimensions [2, N) or " + "different number of dimensions"); + } } return FusedNDRange{Fused, IsHeterogeneousList, NDRanges}; diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index b83a8a26eeae0..224a1984d2902 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -212,7 +212,8 @@ These restrictions can be simplified to: - No two local sizes specified by the nd-ranges will be different; - No global id remapping is needed ([see](#work-item-remapping)) or all input offsets are 0; -- All the fused nd-ranges must have the same offset. +- All the fused nd-ranges must have the same offset; +- No global id remapping is needed for kernels specifying a local size. As we can see, there is no restrictions in the number of dimensions or global sizes of the input nd-ranges. diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp index 709befa514915..930fcc12eff86 100644 --- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp @@ -15,14 +15,19 @@ enum class Internalization { None, Local, Private }; template size_t getSize(Range r); -template <> size_t getSize(range<1> r) { return r.size(); } -template <> size_t getSize(nd_range<1> r) { +template size_t getSize(range r) { + return r.size(); +} +template size_t getSize(nd_range r) { return r.get_global_range().size(); } template void performFusion(queue &q, Range1 R1, Range2 R2) { + using IndexTy1 = item; + using IndexTy2 = item; + int in[dataSize], tmp[dataSize], out[dataSize]; for (size_t i = 0; i < dataSize; ++i) { @@ -43,15 +48,19 @@ void performFusion(queue &q, Range1 R1, Range2 R2) { q.submit([&](handler &cgh) { auto accIn = bIn.get_access(cgh); auto accTmp = bTmp.get_access(cgh); - cgh.parallel_for( - R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; }); + cgh.parallel_for(R1, [=](IndexTy1 i) { + size_t j = i.get_linear_id(); + accTmp[j] = accIn[j] + 5; + }); }); q.submit([&](handler &cgh) { auto accTmp = bTmp.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( - R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + cgh.parallel_for(R2, [=](IndexTy2 i) { + size_t j = i.get_linear_id(); + accOut[j] = accTmp[j] * 2; + }); }); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); @@ -117,5 +126,17 @@ int main() { // CHECK-NEXT: Cannot fuse kernels whose fusion would yield non-uniform work-group sizes // CHECK: COMPUTATION OK + // Scenario: Fusing two kernels that may lead to synchronization issues as two + // work-items in the same work-group may not be in the same work-group in the + // fused ND-range. + performFusion( + q, nd_range<2>{range<2>{2, 2}, range<2>{2, 2}}, + nd_range<2>{range<2>{4, 4}, range<2>{2, 2}}); + // CHECK: ERROR: JIT compilation for kernel fusion failed with message: + // CHECK-NEXT: Illegal ND-range combination + // CHECK-NEXT: Detailed information: + // CHECK-NEXT: Cannot fuse kernels when any of the fused kernels with a specific local size has different global sizes in dimensions [2, N) or different number of dimensions + // CHECK: COMPUTATION OK + return 0; }