From ad494e9dd3a7d9a305821f693edc76645bf2ce30 Mon Sep 17 00:00:00 2001 From: Colin Davidson Date: Tue, 10 Sep 2024 09:34:03 +0100 Subject: [PATCH] [SYCL][NATIVECPU] Fix local scope module variables for native cpu (#15280) Although local scope variables inside the kernel are less common in SYCL, they can happen with hierarchical. This fixes the problem by adding a pass to replace the local scope variables which start life as globals with a struct which is allocated on the stack. Additionally, this required updating of the code which renames and removes kernel based on wrappers and vecz success. To simplify this we run the OCK utility pass TransferKernelMetadata which adds metadata to store the original kernel name. This in turn simplifies this code significantly. Note this fixes fails in kernel/kernel_attributes_wg_hint.cpp SYCL CTS for native cpu, which is being tested locally. --- .../PipelineSYCLNativeCPU.cpp | 4 ++ .../PrepareSYCLNativeCPU.cpp | 48 +++++-------------- .../native_cpu/local_module_scope.cpp | 39 +++++++++++++++ 3 files changed, 54 insertions(+), 37 deletions(-) create mode 100644 sycl/test/check_device_code/native_cpu/local_module_scope.cpp diff --git a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp index 581c2f4866c9c..1454c10fc4200 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp @@ -21,7 +21,9 @@ #include "compiler/utils/builtin_info.h" #include "compiler/utils/define_mux_builtins_pass.h" #include "compiler/utils/device_info.h" +#include "compiler/utils/encode_kernel_metadata_pass.h" #include "compiler/utils/prepare_barriers_pass.h" +#include "compiler/utils/replace_local_module_scope_variables_pass.h" #include "compiler/utils/sub_group_analysis.h" #include "compiler/utils/work_item_loops_pass.h" #include "vecz/pass.h" @@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( OptimizationLevel OptLevel) { MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass()); #ifdef NATIVECPU_USE_OCK + MPM.addPass(compiler::utils::TransferKernelMetadataPass()); // Always enable vectorizer, unless explictly disabled or -O0 is set. if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) { MAM.registerPass([] { return vecz::TargetInfoAnalysis(); }); @@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); }); MPM.addPass(compiler::utils::PrepareBarriersPass()); MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts)); + MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass()); MPM.addPass(AlwaysInlinerPass()); #endif MPM.addPass(PrepareSYCLNativeCPUPass()); diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp index c5625217bdfd1..b3888db8a7b50 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp @@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, SmallSet RemovableFuncs; SmallVector WrapperFuncs; - // Retrieve the wrapper functions created by the WorkItemLoop pass. for (auto &OldF : OldKernels) { - std::optional VeczR = - compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); - if (VeczR && VeczR.value().first) { - WrapperFuncs.push_back(OldF); - } else { - auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); - if (Name != OldF->getName()) { - WrapperFuncs.push_back(OldF); - } - } - } - - for (auto &OldF : WrapperFuncs) { // If vectorization occurred, at this point we have a wrapper function - // that runs the vectorized kernel and peels using the scalar kernel. We - // make it so this wrapper steals the original kernel name. - std::optional VeczR = - compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); - if (VeczR && VeczR.value().first) { - auto ScalarF = VeczR.value().first; - OldF->takeName(ScalarF); - if (ScalarF->use_empty()) - RemovableFuncs.insert(ScalarF); - } else { - // The WorkItemLoops pass created a wrapper function for the original - // kernel. If we have a kernel named foo(), the wrapper will be called - // foo-wrapper(), and will have the original kernel name retrieved by - // getBaseFnNameOrFnName. We set the name of the wrapper function - // to the original kernel name and add the original kernel to the - // list of functions that can be removed from the module. - auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); - Function *OrigF = M.getFunction(Name); + // that runs the vectorized kernel and peels using the scalar kernel. + // There may also be a wrapper for local variables replacement. We make it + // so this wrapper steals the original kernel name. Otherwise we will have + // a wrapper function from the work item loops. In this case we also steal + // the original kernel name. + auto Name = compiler::utils::getOrigFnName(*OldF); + Function *OrigF = M.getFunction(Name); + if (Name != OldF->getName()) { if (OrigF != nullptr) { - // The original kernel is inlined by the WorkItemLoops - // pass if it contained barriers or group collectives, otherwise - // we don't want to (and can't) remove it. - if (OrigF->use_empty()) - RemovableFuncs.insert(OrigF); OldF->takeName(OrigF); + if (OrigF->use_empty()) { + RemovableFuncs.insert(OrigF); + } } else { OldF->setName(Name); } diff --git a/sycl/test/check_device_code/native_cpu/local_module_scope.cpp b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp new file mode 100644 index 0000000000000..bb1ea27a115bd --- /dev/null +++ b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp @@ -0,0 +1,39 @@ +// REQUIRES: native_cpu_ock + +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s + +// Check that local types structure is created and placed on the stack +// We also check that the attribute mux-orig-fn is created as this is needed to +// find the original function after this pass is run + +// CHECK: %localVarTypes = type { ptr addrspace(1) } +// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]] +// CHECK: alloca %localVarTypes +// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE" + +#include "sycl.hpp" + +template struct Test; + +int main() { + sycl::queue queue; + + constexpr int dims = 1; + constexpr int size = 4; + + std::array data; + + const auto range = sycl::range(size); + const auto range_wg = sycl::range(1); + { + sycl::buffer buf(data.data(), range); + + queue.submit([&](sycl::handler &cgh) { + auto acc = sycl::accessor(buf, cgh, sycl::write_only); + cgh.parallel_for_work_group>( + range, range_wg, [=](auto group) { acc[group.get_group_id()] = 42; }); + }); + queue.wait_and_throw(); + } + return 0; +}