[SYCL] [NATIVECPU] Integrate OneAPI Construction Kit vectorizer (#12659)

This PR enables Whole Function Vectorization for the Native CPU backend by integrating the OneAPI Construction Kit's vectorizer.
intel · Feb 28, 2024 · 330ac57 · 330ac57
1 parent d7fb328
commit 330ac57
Show file tree

Hide file tree

Showing 11 changed files with 199 additions and 57 deletions.
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
@@ -303,7 +303,7 @@ LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
         "SYCL integration header")
 LANGOPT(SYCLAllowVirtualFunctions, 1, 0,
         "Allow virtual functions calls in code for SYCL device")
-LANGOPT(SYCLIsNativeCPU      , 1, 0, "Generate code for SYCL NativeCPU")
+LANGOPT(SYCLIsNativeCPU      , 1, 0, "Generate code for SYCL Native CPU")
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
 LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1086,7 +1086,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM);
+      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());

diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -12,13 +12,15 @@
 #pragma once
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Passes/OptimizationLevel.h"
 
 namespace llvm {
 namespace sycl {
 namespace utils {
 
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM);
+                                   ModuleAnalysisManager &MAM,
+                                   OptimizationLevel OptLevel);
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -11,46 +11,87 @@
 // When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Passes/PassBuilder.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
+#include "compiler/utils/device_info.h"
 #include "compiler/utils/prepare_barriers_pass.h"
 #include "compiler/utils/sub_group_analysis.h"
 #include "compiler/utils/work_item_loops_pass.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_target_info.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #endif
 
 using namespace llvm;
 using namespace sycl::utils;
 
-cl::opt<bool>
+static cl::opt<bool>
     ForceNoTail("native-cpu-force-no-tail", cl::init(false),
                 cl::desc("Never emit the peeling loop for vectorized kernels,"
                          "even when the local size is not known to be a "
                          "multiple of the vector width"));
 
-cl::opt<bool> IsDebug(
+static cl::opt<bool> IsDebug(
     "native-cpu-debug", cl::init(false),
     cl::desc("Emit extra alloca instructions to preserve the value of live"
              "variables between barriers"));
 
+static cl::opt<unsigned> NativeCPUVeczWidth(
+    "sycl-native-cpu-vecz-width", cl::init(8),
+    cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));
+
+static cl::opt<bool>
+    SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false),
+                        cl::desc("Disable vectorizer for SYCL Native CPU"));
+
 void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
-    llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM) {
+    llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM,
+    OptimizationLevel OptLevel) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
+  // Always enable vectorizer, unless explictly disabled or -O0 is set.
+  if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
+    MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
+    MAM.registerPass([] { return compiler::utils::DeviceInfoAnalysis(); });
+    auto QueryFunc =
+        [](const llvm::Function &F, const llvm::ModuleAnalysisManager &,
+           llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
+      if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+        return false;
+      }
+      compiler::utils::VectorizationFactor VF(NativeCPUVeczWidth, false);
+      vecz::VeczPassOptions VPO;
+      VPO.factor = std::move(VF);
+      Opts.emplace_back(std::move(VPO));
+      return true;
+    };
+    MAM.registerPass(
+        [QueryFunc] { return vecz::VeczPassOptionsAnalysis(QueryFunc); });
+    MPM.addPass(vecz::RunVeczPass());
+  }
   compiler::utils::WorkItemLoopsPassOptions Opts;
   Opts.IsDebug = IsDebug;
   Opts.ForceNoTail = ForceNoTail;
-  MAM.registerPass([&] { return compiler::utils::BuiltinInfoAnalysis(); });
-  MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
+  MAM.registerPass([] { return compiler::utils::BuiltinInfoAnalysis(); });
+  MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::PrepareBarriersPass());
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
   MPM.addPass(AlwaysInlinerPass());
 #endif
   MPM.addPass(PrepareSYCLNativeCPUPass());
   MPM.addPass(RenameKernelSYCLNativeCPUPass());
+
+  // Run optimization passes after all the changes we made to the kernels.
+  // Todo: check optimization level from clang
+  // Todo: maybe we could find a set of relevant passes instead of re-running
+  // the full optimization pipeline.
+  PassBuilder PB;
+  MPM.addPass(PB.buildPerModuleDefaultPipeline(OptLevel));
 }
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
-#include "llvm/BinaryFormat/MsgPack.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -30,28 +29,20 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Operator.h"
 #include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include <functional>
-#include <numeric>
-#include <set>
 #include <utility>
 #include <vector>
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/attributes.h"
 #include "compiler/utils/builtin_info.h"
+#include "compiler/utils/metadata.h"
 #endif
 
 using namespace llvm;
@@ -317,6 +308,8 @@ static Function *getReplaceFunc(Module &M, StringRef Name, Type *StateType) {
 }
 
 static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
+  // Todo: we should probably cache the state thread local load here
+  // to avoid re-emitting it for each builtin
   if (StateTLS) {
     IRBuilder<> BB(&*F->getEntryBlock().getFirstInsertionPt());
     llvm::Value *V = BB.CreateThreadLocalAddress(StateTLS);
@@ -353,6 +346,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 
   CurrentStatePointerTLS = nullptr;
 
+  // check if any of the kernels is called by some other function.
+  // This can happen e.g. with OCK, where wrapper functions are
+  // created around the original kernel.
+  bool KernelIsCalled = false;
+  for (auto &K : OldKernels) {
+    for (auto &U : K->uses()) {
+      if (isa<CallBase>(U.getUser())) {
+        KernelIsCalled = true;
+      }
+    }
+  }
+
   // Then we iterate over all the supported builtins, find the used ones
   llvm::SmallVector<std::pair<llvm::Function *, StringRef>> UsedBuiltins;
   for (const auto &Entry : BuiltinNamesMap) {
@@ -361,9 +366,9 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
       continue;
     for (const auto &Use : Glob->uses()) {
       auto *I = cast<CallInst>(Use.getUser());
-      if (!IsNativeCPUKernel(I->getFunction())) {
+      if (!IsNativeCPUKernel(I->getFunction()) || KernelIsCalled) {
         // only use the threadlocal if we have kernels calling builtins
-        // indirectly
+        // indirectly, or if the kernel is called by some other func.
         if (CurrentStatePointerTLS == nullptr)
           CurrentStatePointerTLS = M.getOrInsertGlobal(
               STATE_TLS_NAME, StatePtrType, [&M, StatePtrType]() {
@@ -388,12 +393,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   SmallVector<Function *> NewKernels;
   for (auto &OldF : OldKernels) {
 #ifdef NATIVECPU_USE_OCK
-    // The OCK creates a wrapper function around the original kernel with
-    // the WorkItemLoopsPass.
-    // At runtime, we want to run the wrapper function, therefore we
-    // make it so the wrapper steals the original kernel name.
     auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-    if (Name != OldF->getName()) {
+    OldF->setName(Name);
+    // if vectorization occurred, at this point we have a wrapper function that
+    // runs the vectorized kernel and peels using the scalar kernel. We make it
+    // so this wrapper steals the original kernel name.
+    std::optional<compiler::utils::LinkMetadataResult> veczR =
+        compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
+    if (veczR) {
+      auto ScalarF = veczR.value().first;
+      OldF->takeName(ScalarF);
+      ScalarF->setName(OldF->getName() + "_scalar");
+    } else if (Name != OldF->getName()) {
       auto RealKernel = M.getFunction(Name);
       if (RealKernel) {
         // the real kernel was not inlined in the wrapper, steal its name

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
@@ -69,11 +69,20 @@ cmake \
 
 Note that a number of `e2e` tests are currently still failing.
 
+# Vectorization
+
+With the integration of the OneAPI Construction Kit, the SYCL Native CPU target
+also gained support for Whole Function Vectorization.\\
+Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
+* `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
+* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
+
+For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#technical-details) section.
+
 ## Ongoing work
 
 * Complete support for remaining SYCL features, including but not limited to
   * math and other builtins
-* Vectorization (e.g. Whole Function Vectorization)
 * Subgroup support
 * Performance optimizations
 
@@ -169,12 +178,55 @@ entry:
   ret void
 }
 ```
+
 As you can see, the `subhandler` steals the kernel's function name, and receives two pointer arguments: the first one points to the kernel arguments from the SYCL runtime, and the second one to the `__nativecpu_state` struct.
 
 ## Handling barriers 
 
 On SYCL Native CPU, calls to `__spirv_ControlBarrier` are handled using the `WorkItemLoopsPass` from the oneAPI Construction Kit. This pass handles barriers by splitting the kernel between calls calls to `__spirv_ControlBarrier`, and creating a wrapper that runs the subkernels over the local range. In order to correctly interface to the oneAPI Construction Kit pass pipeline, SPIRV builtins are converted to `mux` builtins (used by the OCK) by the `ConvertToMuxBuiltinsSYCLNativeCPUPass`.
 
+## Vectorization
+
+The OneAPI Construction Kit's Whole Function Vectorizer is executed as an LLVM Pass. Considering the following input function:
+
+```llvm
+define void @SimpleVadd(i32*, i32*, i32*) {
+  %5 = call i64 @_Z13get_global_idj(i32 0)
+  %6 = getelementptr inbounds i32, ptr %1, i64 %5
+  %7 = load i32, ptr %6, align 4
+  %8 = getelementptr inbounds i32, ptr %2, i64 %5
+  %9 = load i32, ptr %8, align 4
+  %10 = add nsw i32 %9, %7
+  %11 = getelementptr inbounds i32, ptr %0, i64 %5
+  store i32 %10, ptr %11, align 4
+  ret void
+}
+```
+
+With a vector width of 8, the vectorizer will produce:
+
+```llvm
+define void @__vecz_v8_SimpleVadd(i32*, i32*, i32*) !codeplay_ca_vecz.derived !2 {
+  %5 = call i64 @_Z13get_global_idj(i32 0)
+  %6 = getelementptr inbounds i32, ptr %1, i64 %5
+  %7 = load <8 x i32>, ptr %6, align 4
+  %8 = getelementptr inbounds i32, ptr %2, i64 %5
+  %9 = load <8 x i32>, ptr %8, align 4
+  %10 = add nsw <8 x i32> %9, %7
+  %11 = getelementptr inbounds i32, ptr %0, i64 %5
+  store <8 x i32> %12, ptr %11, align 4
+  ret void
+}
+!1 = !{i32 8, i32 0, i32 0, i32 0}
+!2 = !{!1, ptr @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperI10SimpleVaddEE}
+```
+
+The `__vecz_v8_SimpleVadd` function is the vectorized version of the original function. It receives arguments of the same type,
+and has the `codeplay_ca_vecz.derived` metadata node attached. The metadata node contains information about the vectorization width,
+and points to the original version of the function. This information is used later in the pass pipeline by the `WorkItemLoopsPass`,
+which will account for the vectorization when creating the Work Item Loops, and use the original version of the function to add
+peeling loops.
+
 ## Kernel registration
 
 In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
@@ -189,7 +241,7 @@ In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied
     BinaryStart                         BinaryEnd  
 ```
 
-Each entry in the array contains the kernel name as a string, and a pointer to the `sunhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.
+Each entry in the array contains the kernel name as a string, and a pointer to the `subhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.
 
 ## Kernel lowering and execution
 

diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -28,13 +28,12 @@ if(NATIVECPU_USE_OCK)
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
    GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
-   # commit 558b76cd6080107a2d65f7e37468f3d01eef80f7
-   # Merge: a5ec372f 542ee121
-   # Author: PietroGhg <38155419+PietroGhg@users.noreply.github.com>
-   # Date:   Tue Jan 30 12:53:39 2024 +0100
-   # Merge pull request #322 from PietroGhg/pietro/update_30_jan
-   # Update sycl_native_experimental branch
-   GIT_TAG 558b76cd6080107a2d65f7e37468f3d01eef80f7
+   #  commit 63f4ba99fc758ffc4268a807b21816b6be1b1b68
+   #  Author: PietroGhg <38155419+PietroGhg@users.noreply.github.com>
+   #  Date:   Wed Jan 31 14:06:36 2024 +0100
+   #  Merge pull request #326 from PietroGhg/pietro/vecz
+   #  Integrate vecz in experimental branch
+   GIT_TAG 63f4ba99fc758ffc4268a807b21816b6be1b1b68
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)
@@ -48,8 +47,9 @@ if(NATIVECPU_USE_OCK)
   target_include_directories(LLVMSYCLLowerIR PRIVATE 
     ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
     ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
+    ${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
     ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
-  target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-pipeline)
+  target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-pipeline vecz)
   target_compile_definitions(pi_native_cpu PRIVATE NATIVECPU_USE_OCK)
 
 endif()

diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
@@ -27,7 +27,7 @@ int main() {
   sycl::range<1> r(1);
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
-    // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
+    // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}, ptr {{.*}}, ptr addrspace(1){{.*}})
     // CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
     // CHECK-NOT: @llvm.threadlocal
 

diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -0,0 +1,24 @@
+// REQUIRES: native_cpu_be
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
+// RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
+#include <sycl/sycl.hpp>
+class Test1;
+int main() {
+  sycl::queue deviceQueue;
+  sycl::accessor<int, 1, sycl::access::mode::write> acc;
+  sycl::range<1> r(1);
+  deviceQueue.submit([&](sycl::handler &h) {
+    h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
+    // CHECK-DEFAULT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+    // CHECK-16: store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+    // CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+    // CHECK-O0: store i32 42
+    // CHECK-O0-NOT: store <{{.*}}>
+    // CHECK-DISABLE: store i32 42
+    // CHECK-DISABLE-NOT: store <{{.*}}>
+  });
+}