Skip to content

Commit

Permalink
[SYCL] [NATIVECPU] Integrate OneAPI Construction Kit vectorizer (#12659)
Browse files Browse the repository at this point in the history
This PR enables Whole Function Vectorization for the Native CPU backend
by integrating the OneAPI Construction Kit's vectorizer.
  • Loading branch information
PietroGhg authored Feb 28, 2024
1 parent d7fb328 commit 330ac57
Show file tree
Hide file tree
Showing 11 changed files with 199 additions and 57 deletions.
2 changes: 1 addition & 1 deletion clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
"SYCL integration header")
LANGOPT(SYCLAllowVirtualFunctions, 1, 0,
"Allow virtual functions calls in code for SYCL device")
LANGOPT(SYCLIsNativeCPU , 1, 0, "Generate code for SYCL NativeCPU")
LANGOPT(SYCLIsNativeCPU , 1, 0, "Generate code for SYCL Native CPU")

LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/CodeGen/BackendUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1086,7 +1086,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
}

if (SYCLNativeCPUBackend) {
sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM);
llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level);
}
if (LangOpts.SYCLIsDevice) {
MPM.addPass(SYCLMutatePrintfAddrspacePass());
Expand Down
4 changes: 3 additions & 1 deletion llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
#pragma once
#include "llvm/ADT/Twine.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"

namespace llvm {
namespace sycl {
namespace utils {

void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
ModuleAnalysisManager &MAM);
ModuleAnalysisManager &MAM,
OptimizationLevel OptLevel);
const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
const constexpr char NativeCPUGlobaRange[] =
"__dpcpp_nativecpu_get_global_range";
Expand Down
51 changes: 46 additions & 5 deletions llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,46 +11,87 @@
// When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
//
//===----------------------------------------------------------------------===//
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
#include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"

#ifdef NATIVECPU_USE_OCK
#include "compiler/utils/builtin_info.h"
#include "compiler/utils/device_info.h"
#include "compiler/utils/prepare_barriers_pass.h"
#include "compiler/utils/sub_group_analysis.h"
#include "compiler/utils/work_item_loops_pass.h"
#include "vecz/pass.h"
#include "vecz/vecz_target_info.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#endif

using namespace llvm;
using namespace sycl::utils;

cl::opt<bool>
static cl::opt<bool>
ForceNoTail("native-cpu-force-no-tail", cl::init(false),
cl::desc("Never emit the peeling loop for vectorized kernels,"
"even when the local size is not known to be a "
"multiple of the vector width"));

cl::opt<bool> IsDebug(
static cl::opt<bool> IsDebug(
"native-cpu-debug", cl::init(false),
cl::desc("Emit extra alloca instructions to preserve the value of live"
"variables between barriers"));

static cl::opt<unsigned> NativeCPUVeczWidth(
"sycl-native-cpu-vecz-width", cl::init(8),
cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));

static cl::opt<bool>
SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false),
cl::desc("Disable vectorizer for SYCL Native CPU"));

void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM) {
llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM,
OptimizationLevel OptLevel) {
MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
#ifdef NATIVECPU_USE_OCK
// Always enable vectorizer, unless explictly disabled or -O0 is set.
if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
MAM.registerPass([] { return compiler::utils::DeviceInfoAnalysis(); });
auto QueryFunc =
[](const llvm::Function &F, const llvm::ModuleAnalysisManager &,
llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
return false;
}
compiler::utils::VectorizationFactor VF(NativeCPUVeczWidth, false);
vecz::VeczPassOptions VPO;
VPO.factor = std::move(VF);
Opts.emplace_back(std::move(VPO));
return true;
};
MAM.registerPass(
[QueryFunc] { return vecz::VeczPassOptionsAnalysis(QueryFunc); });
MPM.addPass(vecz::RunVeczPass());
}
compiler::utils::WorkItemLoopsPassOptions Opts;
Opts.IsDebug = IsDebug;
Opts.ForceNoTail = ForceNoTail;
MAM.registerPass([&] { return compiler::utils::BuiltinInfoAnalysis(); });
MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
MAM.registerPass([] { return compiler::utils::BuiltinInfoAnalysis(); });
MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
MPM.addPass(compiler::utils::PrepareBarriersPass());
MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
MPM.addPass(AlwaysInlinerPass());
#endif
MPM.addPass(PrepareSYCLNativeCPUPass());
MPM.addPass(RenameKernelSYCLNativeCPUPass());

// Run optimization passes after all the changes we made to the kernels.
// Todo: check optimization level from clang
// Todo: maybe we could find a set of relevant passes instead of re-running
// the full optimization pipeline.
PassBuilder PB;
MPM.addPass(PB.buildPerModuleDefaultPipeline(OptLevel));
}
45 changes: 28 additions & 17 deletions llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
#include "llvm/BinaryFormat/MsgPack.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DebugInfoMetadata.h"
Expand All @@ -30,28 +29,20 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <functional>
#include <numeric>
#include <set>
#include <utility>
#include <vector>

#ifdef NATIVECPU_USE_OCK
#include "compiler/utils/attributes.h"
#include "compiler/utils/builtin_info.h"
#include "compiler/utils/metadata.h"
#endif

using namespace llvm;
Expand Down Expand Up @@ -317,6 +308,8 @@ static Function *getReplaceFunc(Module &M, StringRef Name, Type *StateType) {
}

static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
// Todo: we should probably cache the state thread local load here
// to avoid re-emitting it for each builtin
if (StateTLS) {
IRBuilder<> BB(&*F->getEntryBlock().getFirstInsertionPt());
llvm::Value *V = BB.CreateThreadLocalAddress(StateTLS);
Expand Down Expand Up @@ -353,6 +346,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,

CurrentStatePointerTLS = nullptr;

// check if any of the kernels is called by some other function.
// This can happen e.g. with OCK, where wrapper functions are
// created around the original kernel.
bool KernelIsCalled = false;
for (auto &K : OldKernels) {
for (auto &U : K->uses()) {
if (isa<CallBase>(U.getUser())) {
KernelIsCalled = true;
}
}
}

// Then we iterate over all the supported builtins, find the used ones
llvm::SmallVector<std::pair<llvm::Function *, StringRef>> UsedBuiltins;
for (const auto &Entry : BuiltinNamesMap) {
Expand All @@ -361,9 +366,9 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
continue;
for (const auto &Use : Glob->uses()) {
auto *I = cast<CallInst>(Use.getUser());
if (!IsNativeCPUKernel(I->getFunction())) {
if (!IsNativeCPUKernel(I->getFunction()) || KernelIsCalled) {
// only use the threadlocal if we have kernels calling builtins
// indirectly
// indirectly, or if the kernel is called by some other func.
if (CurrentStatePointerTLS == nullptr)
CurrentStatePointerTLS = M.getOrInsertGlobal(
STATE_TLS_NAME, StatePtrType, [&M, StatePtrType]() {
Expand All @@ -388,12 +393,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
SmallVector<Function *> NewKernels;
for (auto &OldF : OldKernels) {
#ifdef NATIVECPU_USE_OCK
// The OCK creates a wrapper function around the original kernel with
// the WorkItemLoopsPass.
// At runtime, we want to run the wrapper function, therefore we
// make it so the wrapper steals the original kernel name.
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
if (Name != OldF->getName()) {
OldF->setName(Name);
// if vectorization occurred, at this point we have a wrapper function that
// runs the vectorized kernel and peels using the scalar kernel. We make it
// so this wrapper steals the original kernel name.
std::optional<compiler::utils::LinkMetadataResult> veczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (veczR) {
auto ScalarF = veczR.value().first;
OldF->takeName(ScalarF);
ScalarF->setName(OldF->getName() + "_scalar");
} else if (Name != OldF->getName()) {
auto RealKernel = M.getFunction(Name);
if (RealKernel) {
// the real kernel was not inlined in the wrapper, steal its name
Expand Down
56 changes: 54 additions & 2 deletions sycl/doc/design/SYCLNativeCPU.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,20 @@ cmake \

Note that a number of `e2e` tests are currently still failing.

# Vectorization

With the integration of the OneAPI Construction Kit, the SYCL Native CPU target
also gained support for Whole Function Vectorization.\\
Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
* `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.

For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#technical-details) section.

## Ongoing work

* Complete support for remaining SYCL features, including but not limited to
* math and other builtins
* Vectorization (e.g. Whole Function Vectorization)
* Subgroup support
* Performance optimizations

Expand Down Expand Up @@ -169,12 +178,55 @@ entry:
ret void
}
```

As you can see, the `subhandler` steals the kernel's function name, and receives two pointer arguments: the first one points to the kernel arguments from the SYCL runtime, and the second one to the `__nativecpu_state` struct.

## Handling barriers

On SYCL Native CPU, calls to `__spirv_ControlBarrier` are handled using the `WorkItemLoopsPass` from the oneAPI Construction Kit. This pass handles barriers by splitting the kernel between calls calls to `__spirv_ControlBarrier`, and creating a wrapper that runs the subkernels over the local range. In order to correctly interface to the oneAPI Construction Kit pass pipeline, SPIRV builtins are converted to `mux` builtins (used by the OCK) by the `ConvertToMuxBuiltinsSYCLNativeCPUPass`.

## Vectorization

The OneAPI Construction Kit's Whole Function Vectorizer is executed as an LLVM Pass. Considering the following input function:

```llvm
define void @SimpleVadd(i32*, i32*, i32*) {
%5 = call i64 @_Z13get_global_idj(i32 0)
%6 = getelementptr inbounds i32, ptr %1, i64 %5
%7 = load i32, ptr %6, align 4
%8 = getelementptr inbounds i32, ptr %2, i64 %5
%9 = load i32, ptr %8, align 4
%10 = add nsw i32 %9, %7
%11 = getelementptr inbounds i32, ptr %0, i64 %5
store i32 %10, ptr %11, align 4
ret void
}
```

With a vector width of 8, the vectorizer will produce:

```llvm
define void @__vecz_v8_SimpleVadd(i32*, i32*, i32*) !codeplay_ca_vecz.derived !2 {
%5 = call i64 @_Z13get_global_idj(i32 0)
%6 = getelementptr inbounds i32, ptr %1, i64 %5
%7 = load <8 x i32>, ptr %6, align 4
%8 = getelementptr inbounds i32, ptr %2, i64 %5
%9 = load <8 x i32>, ptr %8, align 4
%10 = add nsw <8 x i32> %9, %7
%11 = getelementptr inbounds i32, ptr %0, i64 %5
store <8 x i32> %12, ptr %11, align 4
ret void
}
!1 = !{i32 8, i32 0, i32 0, i32 0}
!2 = !{!1, ptr @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperI10SimpleVaddEE}
```

The `__vecz_v8_SimpleVadd` function is the vectorized version of the original function. It receives arguments of the same type,
and has the `codeplay_ca_vecz.derived` metadata node attached. The metadata node contains information about the vectorization width,
and points to the original version of the function. This information is used later in the pass pipeline by the `WorkItemLoopsPass`,
which will account for the vectorization when creating the Work Item Loops, and use the original version of the function to add
peeling loops.

## Kernel registration

In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
Expand All @@ -189,7 +241,7 @@ In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied
BinaryStart BinaryEnd
```

Each entry in the array contains the kernel name as a string, and a pointer to the `sunhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.
Each entry in the array contains the kernel name as a string, and a pointer to the `subhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.

## Kernel lowering and execution

Expand Down
16 changes: 8 additions & 8 deletions sycl/plugins/native_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,12 @@ if(NATIVECPU_USE_OCK)
include(FetchContent)
FetchContent_Declare(oneapi-ck
GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
# commit 558b76cd6080107a2d65f7e37468f3d01eef80f7
# Merge: a5ec372f 542ee121
# Author: PietroGhg <38155419+PietroGhg@users.noreply.github.com>
# Date: Tue Jan 30 12:53:39 2024 +0100
# Merge pull request #322 from PietroGhg/pietro/update_30_jan
# Update sycl_native_experimental branch
GIT_TAG 558b76cd6080107a2d65f7e37468f3d01eef80f7
# commit 63f4ba99fc758ffc4268a807b21816b6be1b1b68
# Author: PietroGhg <38155419+PietroGhg@users.noreply.github.com>
# Date: Wed Jan 31 14:06:36 2024 +0100
# Merge pull request #326 from PietroGhg/pietro/vecz
# Integrate vecz in experimental branch
GIT_TAG 63f4ba99fc758ffc4268a807b21816b6be1b1b68
)
FetchContent_GetProperties(oneapi-ck)
if(NOT oneapi-ck_POPULATED)
Expand All @@ -48,8 +47,9 @@ if(NATIVECPU_USE_OCK)
target_include_directories(LLVMSYCLLowerIR PRIVATE
${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
${oneapi-ck_SOURCE_DIR}/modules/cargo/include
${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-pipeline)
target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-pipeline vecz)
target_compile_definitions(pi_native_cpu PRIVATE NATIVECPU_USE_OCK)

endif()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ int main() {
sycl::range<1> r(1);
deviceQueue.submit([&](sycl::handler &h) {
h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
// CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
// CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}, ptr {{.*}}, ptr addrspace(1){{.*}})
// CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
// CHECK-NOT: @llvm.threadlocal

Expand Down
24 changes: 24 additions & 0 deletions sycl/test/check_device_code/native_cpu/vectorization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// REQUIRES: native_cpu_be
// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm -o %t_temp.ll %s
// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT
// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
// RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
#include <sycl/sycl.hpp>
class Test1;
int main() {
sycl::queue deviceQueue;
sycl::accessor<int, 1, sycl::access::mode::write> acc;
sycl::range<1> r(1);
deviceQueue.submit([&](sycl::handler &h) {
h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
// CHECK-DEFAULT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
// CHECK-16: store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
// CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
// CHECK-O0: store i32 42
// CHECK-O0-NOT: store <{{.*}}>
// CHECK-DISABLE: store i32 42
// CHECK-DISABLE-NOT: store <{{.*}}>
});
}
Loading

0 comments on commit 330ac57

Please sign in to comment.