From a623d24791fb4387cdc1118bd525c8a4aa4ed638 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Fri, 20 Oct 2023 11:42:00 +0100
Subject: [PATCH 01/43] Support barriers on Native CPU

---
 clang/lib/CodeGen/BackendUtil.cpp             |  18 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   2 +-
 clang/test/CodeGenSYCL/native_cpu_basic.cpp   |   4 +-
 .../ConvertToMuxBuiltinsSYCLNativeCPU.h       |  30 ++
 .../llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h  |   5 +
 llvm/include/llvm/SYCLLowerIR/SYCLUtils.h     |   4 +-
 llvm/lib/SYCLLowerIR/CMakeLists.txt           |   2 +
 .../ConvertToMuxBuiltinsSYCLNativeCPU.cpp     | 183 ++++++++
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp |  31 ++
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 185 ++++----
 sycl/doc/design/SYCLNativeCPU.md              |  51 ++-
 sycl/include/sycl/detail/native_cpu.hpp       |  34 +-
 sycl/plugins/native_cpu/CMakeLists.txt        |  39 ++
 sycl/plugins/unified_runtime/CMakeLists.txt   |  12 +
 .../ur/adapters/native_cpu/enqueue.cpp        |  55 ++-
 .../native_cpu/kernelhandler-scalar.cpp       |  15 +-
 .../native_cpu/native_cpu_builtins.cpp        |  32 +-
 .../native_cpu/native_cpu_subhandler.cpp      |  11 +-
 sycl/test/native_cpu/barrier-external.cpp     |  46 ++
 sycl/test/native_cpu/barrier-simple.cpp       |  35 ++
 sycl/test/native_cpu/local-id-range.cpp       |   2 +-
 sycl/test/native_cpu/matrix-multiply.cpp      | 397 ++++++++++++++++++
 sycl/test/native_cpu/sycl-external-static.cpp |  69 +++
 sycl/test/native_cpu/sycl-external.cpp        |   7 -
 24 files changed, 1080 insertions(+), 189 deletions(-)
 create mode 100644 llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
 create mode 100644 llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
 create mode 100644 llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
 create mode 100644 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
 create mode 100644 sycl/test/native_cpu/barrier-external.cpp
 create mode 100644 sycl/test/native_cpu/barrier-simple.cpp
 create mode 100644 sycl/test/native_cpu/matrix-multiply.cpp
 create mode 100644 sycl/test/native_cpu/sycl-external-static.cpp
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 81eee4ab4e2c0..f6ddf420a30f6 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -49,9 +49,8 @@
 #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
 #include "llvm/SYCLLowerIR/MutatePrintfAddrspace.h"
-#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
-#include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
+#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CommandLine.h"
@@ -112,9 +111,9 @@ static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
     "sanitizer-early-opt-ep", cl::Optional,
     cl::desc("Insert sanitizers on OptimizerEarlyEP."), cl::init(false));
 
-static cl::opt<bool> SYCLNativeCPURename(
-    "sycl-native-cpu-rename", cl::init(false),
-    cl::desc("Rename kernel functions for SYCL Native CPU"));
+static cl::opt<bool> SYCLNativeCPUBackend(
+    "sycl-native-cpu-backend", cl::init(false),
+    cl::desc("Run the backend passes for SYCL Native CPU"));
 }
 
 namespace {
@@ -1094,8 +1093,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       MPM.addPass(PB.buildPerModuleDefaultPipeline(Level));
     }
 
-    if (SYCLNativeCPURename)
-      MPM.addPass(RenameKernelSYCLNativeCPUPass());
+    if (SYCLNativeCPUBackend) {
+      addSYCLNativeCPUBackendPasses(MPM, MAM);
+    }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
       if (LangOpts.EnableDAEInSpirKernels)
@@ -1125,10 +1125,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
       // Process properties and annotations
       MPM.addPass(CompileTimePropertiesPass());
 
-      if (LangOpts.SYCLIsNativeCPU) {
-        MPM.addPass(PrepareSYCLNativeCPUPass());
-      }
-
       // Remove SYCL metadata added by the frontend, like sycl_aspects
       // Note, this pass should be at the end of the pipeline
       MPM.addPass(CleanupSYCLMetadataPass());
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c402e3967592d..e3e553040658f 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5549,7 +5549,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
     if (IsSYCLOffloadDevice && IsSYCLNativeCPU) {
       CmdArgs.push_back("-mllvm");
-      CmdArgs.push_back("-sycl-native-cpu-rename");
+      CmdArgs.push_back("-sycl-native-cpu-backend");
     }
 
     // Also ignore explicit -force_cpusubtype_ALL option.
diff --git a/clang/test/CodeGenSYCL/native_cpu_basic.cpp b/clang/test/CodeGenSYCL/native_cpu_basic.cpp
index 87552837cea4a..0fa10a431cb42 100644
--- a/clang/test/CodeGenSYCL/native_cpu_basic.cpp
+++ b/clang/test/CodeGenSYCL/native_cpu_basic.cpp
@@ -50,8 +50,8 @@ void gen() {
 }
 
 // Check name mangling 
-// CHECK-DAG: @_ZTS6init_aIiE.NativeCPUKernel({{.*}})
-// CHECK-DAG: @_ZTS6init_aIfE.NativeCPUKernel({{.*}})
+// CHECK-DAG: @_ZTS6init_aIiE({{.*}})
+// CHECK-DAG: @_ZTS6init_aIfE({{.*}})
 
 // Check Native CPU module flag
 // CHECK-DAG: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1}
diff --git a/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
new file mode 100644
index 0000000000000..c45c985cd0af5
--- /dev/null
+++ b/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
@@ -0,0 +1,30 @@
+//===---- ConvertToMuxBuiltinsSYCLNativeCPU.h - Convert to Mux Builtins ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Converts SPIRV builtins to Mux builtins used by the oneAPI Construction 
+// Kit for SYCL Native CPU
+//
+//===----------------------------------------------------------------------===//
+
+
+#pragma once
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ModulePass;
+
+class ConvertToMuxBuiltinsSYCLNativeCPUPass
+    : public PassInfoMixin<ConvertToMuxBuiltinsSYCLNativeCPUPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+};
+
+} // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h b/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
new file mode 100644
index 0000000000000..c28c056cd1f3a
--- /dev/null
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
@@ -0,0 +1,5 @@
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+void addSYCLNativeCPUBackendPasses(ModulePassManager& MPM, ModuleAnalysisManager& MAM);
+} // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
index 40450d291509c..e4fc8660c90cf 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
@@ -118,10 +118,12 @@ inline bool isSYCLExternalFunction(const Function *F) {
 }
 
 constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU";
+constexpr char SYCLNATIVECPUKERNEL[] = ".NativeCPUKernel";
 inline llvm::Twine addSYCLNativeCPUSuffix(StringRef S) {
+  if(S.startswith("__dpcpp_nativecpu") || S.endswith(SYCLNATIVECPUKERNEL))
+    return S;
   return llvm::Twine(S, SYCLNATIVECPUSUFFIX);
 }
-constexpr char SYCLNATIVECPURENAMEMD[] = "sycl-native-cpu-rename";
 
 } // namespace utils
 } // namespace sycl
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index b02efc582663a..91e78a8e437af 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -72,6 +72,8 @@ add_llvm_component_library(LLVMSYCLLowerIR
   TargetHelpers.cpp
   PrepareSYCLNativeCPU.cpp
   RenameKernelSYCLNativeCPU.cpp
+  ConvertToMuxBuiltinsSYCLNativeCPU.cpp
+  PipelineSYCLNativeCPU.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR
diff --git a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
new file mode 100644
index 0000000000000..563baba03c1a6
--- /dev/null
+++ b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
@@ -0,0 +1,183 @@
+//===-- ConvertToMuxBuiltinsSYCLNativeCPU.cpp - Convert to Mux Builtins ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Converts SPIRV builtins to Mux builtins used by the oneAPI Construction
+// Kit for SYCL Native CPU
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/TargetParser/Triple.h"
+#include <map>
+
+using namespace llvm;
+
+namespace {
+
+// Helper macros for constructing builtin MS names
+#define GENMS1(builtin_str) "?" builtin_str "@@YA_KXZ"
+
+#define GEN_IT_proc(b_str, len) "_Z" #len b_str "v"
+#define GEN_p(b_str, len, ncpu_bstr, num)                                      \
+  {                                                                            \
+    {([]() { static_assert(sizeof(b_str) == len + 1); },                       \
+      GEN_IT_proc(b_str, len)),                                                \
+     GENMS1(b_str)},                                                           \
+    {                                                                          \
+      ncpu_bstr, num                                                           \
+    }                                                                          \
+  }
+#define GEN_xyz(b_name, len, ncpu_name)                                        \
+  GEN_p(#b_name "_x", len, #ncpu_name, 0),                                     \
+      GEN_p(#b_name "_y", len, #ncpu_name, 1),                                 \
+      GEN_p(#b_name "_z", len, #ncpu_name, 2)
+
+// Todo: add support for more SPIRV builtins here
+static const std::pair<std::pair<StringRef, StringRef>,
+                       std::pair<StringRef, unsigned int>>
+    BuiltinNamesMap[] = {
+        GEN_xyz(__spirv_GlobalInvocationId, 28, __mux_get_global_id),
+        GEN_xyz(__spirv_GlobalSize, 20, __mux_get_global_size),
+        GEN_xyz(__spirv_GlobalOffset, 22, __mux_get_global_offset),
+        GEN_xyz(__spirv_LocalInvocationId, 27, __mux_get_local_id),
+        GEN_xyz(__spirv_NumWorkgroups, 23, __mux_get_num_groups),
+        GEN_xyz(__spirv_WorkgroupSize, 23, __mux_get_local_size),
+        GEN_xyz(__spirv_WorkgroupId, 21, __mux_get_group_id),
+};
+
+static inline bool isForVisualStudio(StringRef TripleStr) {
+  llvm::Triple Triple(TripleStr);
+  return Triple.isKnownWindowsMSVCEnvironment();
+}
+
+static constexpr char SPIRVBarrier[] = "_Z22__spirv_ControlBarrierjjj";
+static constexpr char MuxBarrier[] = "__mux_work_group_barrier";
+
+Function *getReplaceFunc(Module &M, StringRef Name) {
+  LLVMContext &Ctx = M.getContext();
+  static auto *MuxFTy =
+      FunctionType::get(Type::getInt64Ty(Ctx), {Type::getInt32Ty(Ctx)}, false);
+  auto F = M.getOrInsertFunction(Name, MuxFTy);
+  return cast<Function>(F.getCallee());
+}
+
+Function *getMuxBarrierFunc(Module &M) {
+  // void __mux_work_group_barrier(i32 %id, i32 %scope, i32 %semantics)
+  LLVMContext &Ctx = M.getContext();
+  auto *Int32Ty = Type::getInt32Ty(Ctx);
+  static auto *MuxFTy = FunctionType::get(Type::getVoidTy(Ctx),
+                                          {Int32Ty, Int32Ty, Int32Ty}, false);
+  auto F = M.getOrInsertFunction(MuxBarrier, MuxFTy);
+  return cast<Function>(F.getCallee());
+}
+
+static constexpr const char *MuxKernelAttrName = "mux-kernel";
+
+void setIsKernelEntryPt(Function &F) {
+  F.addFnAttr(MuxKernelAttrName, "entry-point");
+}
+
+bool replaceBarriers(Module &M) {
+  // DPC++ emits
+  //__spirv_ControlBarrier(__spv::Scope Execution, __spv::Scope Memory,
+  //                       uint32_t Semantics) noexcept;
+  // OCK expects  void __mux_work_group_barrier(i32 %id, i32 %scope, i32
+  // %semantics)
+  // __spv::Scope is
+  // enum Flag : uint32_t {
+  //   CrossDevice = 0,
+  //   Device = 1,
+  //   Workgroup = 2,
+  //   Subgroup = 3,
+  //   Invocation = 4,
+  // };
+  auto *SPIRVBarrierFunc = M.getFunction(SPIRVBarrier);
+  if (!SPIRVBarrierFunc) {
+    // No barriers are found, just return
+    return false;
+  }
+  static auto *MuxBarrierFunc = getMuxBarrierFunc(M);
+  SmallVector<std::pair<Instruction *, Instruction*>> ToRemove;
+  auto *Zero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+  for (auto &Use : SPIRVBarrierFunc->uses()) {
+    auto *I = dyn_cast<CallInst>(Use.getUser());
+    if (!I)
+      report_fatal_error("Unsupported Value in SYCL Native CPU\n");
+    SmallVector<Value *, 3> Args{Zero, I->getArgOperand(0),
+                                 I->getArgOperand(2)}; // todo: check how the
+                                                       // args map to each other
+    auto *NewI = CallInst::Create(MuxBarrierFunc->getFunctionType(),
+                                  MuxBarrierFunc, Args, "", I);
+    ToRemove.push_back(std::pair(I, NewI));
+  }
+
+  for (auto &El : ToRemove) {
+    auto OldI = El.first;
+    auto NewI = El.second;
+    OldI->replaceAllUsesWith(NewI);
+    OldI->eraseFromParent();
+  }
+
+  SPIRVBarrierFunc->eraseFromParent();
+
+  return true;
+}
+
+} // namespace
+
+PreservedAnalyses
+ConvertToMuxBuiltinsSYCLNativeCPUPass::run(Module &M,
+                                           ModuleAnalysisManager &MAM) {
+  bool ModuleChanged = false;
+  for (auto &F : M) {
+    if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+      setIsKernelEntryPt(F);
+  }
+  const bool VisualStudioMangling = isForVisualStudio(M.getTargetTriple());
+
+  // Then we iterate over all the supported builtins, find their uses and
+  // replace them with calls to our Native CPU functions.
+  for (auto &Entry : BuiltinNamesMap) {
+    auto *Glob = M.getFunction(VisualStudioMangling ? Entry.first.second
+                                                    : Entry.first.first);
+    if (!Glob)
+      continue;
+    auto *ReplaceFunc = getReplaceFunc(M, Entry.second.first);
+    SmallVector<std::pair<Instruction *, Instruction *>> ToRemove;
+    for (auto &Use : Glob->uses()) {
+      auto *I = dyn_cast<CallInst>(Use.getUser());
+      if (!I)
+        report_fatal_error("Unsupported Value in SYCL Native CPU\n");
+      auto *Arg = ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                   Entry.second.second);
+      auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc,
+                                    {Arg}, "mux_call", I);
+      ModuleChanged = true;
+      ToRemove.push_back(std::make_pair(I, NewI));
+    }
+
+    for (auto &El : ToRemove) {
+      auto OldI = El.first;
+      auto NewI = El.second;
+      OldI->replaceAllUsesWith(NewI);
+      OldI->eraseFromParent();
+    }
+
+    // Finally, we erase the builtin from the module
+    Glob->eraseFromParent();
+  }
+
+  ModuleChanged |= replaceBarriers(M);
+  return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
new file mode 100644
index 0000000000000..769b8fc862bf6
--- /dev/null
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -0,0 +1,31 @@
+#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
+#include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
+
+#ifdef NATIVECPU_USE_OCK
+#include "compiler/utils/builtin_info.h"
+#include "compiler/utils/sub_group_analysis.h"
+#include "compiler/utils/work_item_loops_pass.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#endif
+
+namespace llvm {
+void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
+                                   ModuleAnalysisManager &MAM) {
+  MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
+#ifdef NATIVECPU_USE_OCK
+  // Todo set options properly
+  compiler::utils::WorkItemLoopsPassOptions Opts;
+  Opts.IsDebug = false;
+  Opts.ForceNoTail = false;
+  MAM.registerPass([&] { return compiler::utils::BuiltinInfoAnalysis(); });
+  MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
+  MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
+  MPM.addPass(AlwaysInlinerPass());
+
+#endif
+  MPM.addPass(PrepareSYCLNativeCPUPass());
+  MPM.addPass(RenameKernelSYCLNativeCPUPass());
+}
+} // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index dcfd0340447f0..c860d3371f19c 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/IR/Constant.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
 
@@ -37,7 +38,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <functional>
@@ -46,6 +46,10 @@
 #include <utility>
 #include <vector>
 
+#ifdef NATIVECPU_USE_OCK
+#include "compiler/utils/builtin_info.h"
+#endif
+
 using namespace llvm;
 
 namespace {
@@ -67,42 +71,6 @@ void fixCallingConv(Function *F) {
   F->addFnAttr("frame-pointer", "none");
 }
 
-// returns the indexes of the used arguments
-SmallVector<unsigned> getUsedIndexes(const Function *F, bool useTLS) {
-  SmallVector<unsigned> Res;
-  auto UsedNode = F->getMetadata("sycl_kernel_omit_args");
-  if (!UsedNode) {
-    // the metadata node is not available if -fenable-sycl-dae
-    // was not set; set everything to true
-    // Exclude one arg because we already added the state ptr
-    const unsigned first = useTLS ? 0 : 1;
-    for (unsigned I = 0, NumP = F->getFunctionType()->getNumParams();
-         I + first < NumP; I++) {
-      Res.push_back(I);
-    }
-    return Res;
-  }
-  auto NumOperands = UsedNode->getNumOperands();
-  for (unsigned I = 0; I < NumOperands; I++) {
-    auto &Op = UsedNode->getOperand(I);
-    if (auto CAM = dyn_cast<ConstantAsMetadata>(Op.get())) {
-      if (auto Const = dyn_cast<ConstantInt>(CAM->getValue())) {
-        auto Val = Const->getValue();
-        if (!Val.getBoolValue()) {
-          Res.push_back(I);
-        }
-      } else {
-        report_fatal_error("Unable to retrieve constant int from "
-                           "sycl_kernel_omit_args metadata node");
-      }
-    } else {
-      report_fatal_error(
-          "Error while processing sycl_kernel_omit_args metadata node");
-    }
-  }
-  return Res;
-}
-
 void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType,
                             Type *StatePtrType, llvm::Constant *StateArgTLS) {
   LLVMContext &Ctx = F->getContext();
@@ -115,7 +83,7 @@ void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType,
   // subhandler steals its name, we add a suffix to the subhandler later
   // on when lowering the device module
   std::string OldName = F->getName().str();
-  auto NewName = Twine(OldName) + ".NativeCPUKernel";
+  auto NewName = Twine(OldName) + sycl::utils::SYCLNATIVECPUKERNEL;
   const StringRef SubHandlerName = OldName;
   F->setName(NewName);
   FunctionType *FTy = FunctionType::get(
@@ -124,22 +92,19 @@ void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType,
   Function *SubhF = cast<Function>(SubhFCallee.getCallee());
 
   // Emit function body, unpack kernel args
-  auto UsedIndexes = getUsedIndexes(F, StateArgTLS);
   auto *KernelTy = F->getFunctionType();
-  // assert(UsedIndexes.size() + 1 == KernelTy->getNumParams() && "mismatch
-  // between number of params and used args");
   IRBuilder<> Builder(Ctx);
   BasicBlock *Block = BasicBlock::Create(Ctx, "entry", SubhF);
   Builder.SetInsertPoint(Block);
-  unsigned NumArgs = UsedIndexes.size();
+  unsigned NumArgs = F->getFunctionType()->getNumParams();
   auto *BaseNativeCPUArg = SubhF->getArg(0);
   SmallVector<Value *, 5> KernelArgs;
-  for (unsigned I = 0; I < NumArgs; I++) {
+  const unsigned Inc = StateArgTLS == nullptr ? 1 : 0;
+  for (unsigned I = 0; I + Inc < NumArgs; I++) {
     auto *Arg = F->getArg(I);
-    auto UsedI = UsedIndexes[I];
     // Load the correct NativeCPUDesc and load the pointer from it
     auto *Addr = Builder.CreateGEP(NativeCPUArgDescType, BaseNativeCPUArg,
-                                   {Builder.getInt64(UsedI)});
+                                   {Builder.getInt64(I)});
     if (Arg->getType()->isPointerTy()) {
       // If the arg is a pointer, just use it
       auto *Load = Builder.CreateLoad(Arg->getType(), Addr);
@@ -209,41 +174,19 @@ Function *cloneFunctionAndAddParam(Function *OldF, Type *T,
   return NewF;
 }
 
-// Helper macros for constructing builtin MS names
-#define GENMS1(builtin_str) "?" builtin_str "@@YA_KXZ"
-
-#define GEN_IT_proc(b_str, len) "_Z" #len b_str "v"
-#define GEN_p(b_str, len, ncpu_bstr, num)                                      \
-  {                                                                            \
-    {([]() { static_assert(sizeof(b_str) == len + 1); },                       \
-      GEN_IT_proc(b_str, len)),                                                \
-     GENMS1(b_str)},                                                           \
-    {                                                                          \
-      ncpu_bstr, num                                                           \
-    }                                                                          \
-  }
-#define GEN_xyz(b_name, len, ncpu_name)                                        \
-  GEN_p(#b_name "_x", len, #ncpu_name, 0),                                     \
-      GEN_p(#b_name "_y", len, #ncpu_name, 1),                                 \
-      GEN_p(#b_name "_z", len, #ncpu_name, 2)
-
-// Todo: add support for more SPIRV builtins here
-static const std::pair<std::pair<StringRef, StringRef>,
-                       std::pair<StringRef, unsigned int>>
-    BuiltinNamesMap[] = {
-        GEN_xyz(__spirv_GlobalInvocationId, 28, __dpcpp_nativecpu_global_id),
-        GEN_xyz(__spirv_GlobalSize, 20, __dpcpp_nativecpu_global_range),
-        GEN_xyz(__spirv_GlobalOffset, 22, __dpcpp_nativecpu_get_global_offset),
-        GEN_xyz(__spirv_LocalInvocationId, 27, __dpcpp_nativecpu_get_local_id),
-        GEN_xyz(__spirv_NumWorkgroups, 23, __dpcpp_nativecpu_get_num_groups),
-        GEN_xyz(__spirv_WorkgroupSize, 23, __dpcpp_nativecpu_get_wg_size),
-        GEN_xyz(__spirv_WorkgroupId, 21, __dpcpp_nativecpu_get_wg_id),
-};
-
-static inline bool IsForVisualStudio(StringRef triple_str) {
-  llvm::Triple triple(triple_str);
-  return triple.isKnownWindowsMSVCEnvironment();
-}
+static const std::pair<StringRef, StringRef> BuiltinNamesMap[]{
+    {"__mux_get_global_id", "__dpcpp_nativecpu_get_global_id"},
+    {"__mux_get_global_size", "__dpcpp_nativecpu_get_global_range"},
+    {"__mux_get_global_offset", "__dpcpp_nativecpu_get_global_offset"},
+    {"__mux_get_local_id", "__dpcpp_nativecpu_get_local_id"},
+    {"__mux_get_num_groups", "__dpcpp_nativecpu_get_num_groups"},
+    {"__mux_get_local_size", "__dpcpp_nativecpu_get_wg_size"},
+    {"__mux_get_group_id", "__dpcpp_nativecpu_get_wg_id"},
+    {"__mux_set_num_sub_groups", "__dpcpp_nativecpu_set_num_sub_groups"},
+    {"__mux_set_sub_group_id", "__dpcpp_nativecpu_set_sub_group_id"},
+    {"__mux_set_max_sub_group_size",
+     "__dpcpp_nativecpu_set_max_sub_group_size"},
+    {"__mux_set_local_id", "__dpcpp_nativecpu_set_local_id"}};
 
 static Function *getReplaceFunc(const Module &M, StringRef Name) {
   Function *F = M.getFunction(Name);
@@ -261,6 +204,14 @@ static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
   return F->getArg(FT->getNumParams() - 1);
 }
 
+void fixUpKernelNameAfterBarrier(Function &F) {
+  Attribute Attr = F.getFnAttribute("mux-base-fn-name");
+  if (Attr.isValid()) {
+    auto Name = Attr.getValueAsString();
+    F.setName(Name);
+  }
+}
+
 static inline bool IsNativeCPUKernel(const Function *F) {
   return F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL;
 }
@@ -287,15 +238,11 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   Type *StatePtrType = PointerType::get(StateType, 1);
 
   CurrentStatePointerTLS = nullptr;
-  const bool VisualStudioMangling = IsForVisualStudio(M.getTargetTriple());
 
   // Then we iterate over all the supported builtins, find the used ones
-  llvm::SmallVector<
-      std::pair<llvm::Function *, const std::pair<StringRef, unsigned int> &>>
-      UsedBuiltins;
+  llvm::SmallVector<std::pair<llvm::Function *, StringRef>> UsedBuiltins;
   for (const auto &Entry : BuiltinNamesMap) {
-    auto *Glob = M.getFunction(VisualStudioMangling ? Entry.first.second
-                                                    : Entry.first.first);
+    auto *Glob = M.getFunction(Entry.first);
     if (!Glob)
       continue;
     for (const auto &Use : Glob->uses()) {
@@ -328,9 +275,13 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 
   SmallVector<Function *> NewKernels;
   for (auto &OldF : OldKernels) {
+#ifdef NATIVECPU_USE_OCK
+    fixUpKernelNameAfterBarrier(*OldF);
+#endif
     auto *NewF =
         cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);
     NewF->takeName(OldF);
+    OldF->replaceAllUsesWith(NewF);
     OldF->eraseFromParent();
     NewKernels.push_back(NewF);
     ModuleChanged = true;
@@ -346,31 +297,65 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   // Then we iterate over all used builtins and
   // replace them with calls to our Native CPU functions.
   for (const auto &Entry : UsedBuiltins) {
-    SmallVector<Instruction *> ToRemove;
+    SmallVector<std::pair<Instruction *, Instruction *>> ToRemove;
     Function *const Glob = Entry.first;
     for (const auto &Use : Glob->uses()) {
-      auto *ReplaceFunc = getReplaceFunc(M, Entry.second.first);
+      auto *ReplaceFunc = getReplaceFunc(M, Entry.second);
       auto I = dyn_cast<CallInst>(Use.getUser());
       if (!I)
         report_fatal_error("Unsupported Value in SYCL Native CPU\n");
-      auto *Arg = ConstantInt::get(Type::getInt32Ty(M.getContext()),
-                                   Entry.second.second);
-      auto *NewI = CallInst::Create(
-          ReplaceFunc->getFunctionType(), ReplaceFunc,
-          {Arg, getStateArg(I->getFunction(), CurrentStatePointerTLS)},
-          "ncpu_call", I);
-      if (I->getMetadata("dbg"))
-        NewI->setDebugLoc(I->getDebugLoc());
-      I->replaceAllUsesWith(NewI);
-      ToRemove.push_back(I);
+      SmallVector<Value *> Args(I->arg_begin(), I->arg_end());
+      Args.push_back(getStateArg(I->getFunction(), CurrentStatePointerTLS));
+      auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc,
+                                    Args, "", I);
+      // If the parent function has debug info, we need to make sure that the
+      // CallInstructions in it have debug info, otherwise we end up with
+      // invalid IR after inlining.
+      if (I->getFunction()->hasMetadata("dbg")) {
+        I->setDebugLoc(DILocation::get(M.getContext(), 0, 0,
+                                       I->getFunction()->getSubprogram()));
+        if (I->getMetadata("dbg"))
+          NewI->setDebugLoc(I->getDebugLoc());
+      }
+      ToRemove.push_back(std::make_pair(I, NewI));
     }
 
-    for (auto &El : ToRemove)
-      El->eraseFromParent();
+    for (auto &El : ToRemove) {
+      auto OldI = El.first;
+      auto NewI = El.second;
+      OldI->replaceAllUsesWith(NewI);
+      OldI->eraseFromParent();
+    }
 
     // Finally, we erase the builtin from the module
     Glob->eraseFromParent();
   }
 
+#ifdef NATIVECPU_USE_OCK
+  // Define __mum_mem_barrier here using the OCK
+  compiler::utils::BuiltinInfo BI;
+  for (auto &F : M) {
+    if (F.getName() == compiler::utils::MuxBuiltins::mem_barrier) {
+      BI.defineMuxBuiltin(compiler::utils::BaseBuiltinID::eMuxBuiltinMemBarrier,
+                          M);
+    }
+  }
+  // if we find calls to mux barrier now, it means that we had SYCL_EXTERNAL
+  // functions that called __mux_work_group_barrier, which didn't get processed
+  // by the WorkItemLoop pass. This means that the actual function call has been
+  // inlined into the kernel, and the call to __mux_work_group_barrier has been
+  // removed in the inlined call, but not in the original function. The original
+  // function will not be executed (since it has been inlined) and so we can
+  // just define __mux_work_group_barrier as a no-op to avoid linker errors.
+  // Todo: currently we can't remove the function here even if it has no uses,
+  // because we may still emit a declaration for in the offload-wrapper.
+  auto BarrierF = M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier);
+  if (BarrierF && BarrierF->isDeclaration()) {
+    IRBuilder<> Builder(M.getContext());
+    auto BB = BasicBlock::Create(M.getContext(), "noop", BarrierF);
+    Builder.SetInsertPoint(BB);
+    Builder.CreateRetVoid();
+  }
+#endif
   return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 551efe0e30fbf..0f68b4de79657 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -27,11 +27,30 @@ clang++ <device-ir> -o <device-o>
 #link
 clang++ -L<sycl-lib-path> -lsycl <device-o> <host-o> -o <output>
 ```
-In order to execute kernels compiled for `native-cpu`, we provide a PI Plugin. The plugin needs to be enabled when configuring DPC++ (e.g. `python buildbot/configure.py --native_cpu`) and needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. 
+
+## Configuring DPC++ with SYCL Native CPU
+
+SYCL Native CPU needs to be enabled explictly when configuring DPC++, using `--native_cpu`, e.g.
+
+```
+python buildbot/configure.py \
+  --native_cpu
+# other options here
+```
+
+SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit) (OCK) in order to support some core SYCL functionalities and improve performances, the OCK is fetched by default when Native CPU is enabled, and can optionally be disabled using the `NATIVE_CPU_USE_OCK` CMake variable (please note that disabling the OCK will result in limited functionalities and performances on the Native CPU backend):
+
+```
+python3 buildbot/configure.py \
+  --enable-plugin native_cpu \
+  --cmake-opt=-DNATIVE_CPU_USE_OCK=Off
+```
+
+The Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. 
 
 # Supported features and current limitations
 
-The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barrier` and several math builtins are not supported, and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](sycl/test/native_cpu).
+The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](sycl/test/native_cpu).
 
 
 To execute the `e2e` tests on the Native CPU, configure the test suite with:
@@ -50,7 +69,17 @@ cmake \
 
 Note that a number of `e2e` tests are currently still failing.
 
-# Running example
+## Ongoing work
+
+* Complete support for remaining SYCL features, including but not limited to
+  * math and other builtins
+* Vectorization (e.g. Whole Function Vectorization)
+* Subgroup support
+* Performance optimizations
+
+### Please note that Windows support is temporarily disabled due to some implementation details, it will be reinstantiated soon.
+
+# Technical details
 
 The following section gives a brief overview of how a simple SYCL application is compiled for the Native CPU target. Consider the following SYCL sample, which performs vector addition using USM:
 
@@ -142,6 +171,10 @@ entry:
 ```
 As you can see, the `subhandler` steals the kernel's function name, and receives two pointer arguments: the first one points to the kernel arguments from the SYCL runtime, and the second one to the `__nativecpu_state` struct.
 
+## Handling barriers 
+
+On SYCL Native CPU, calls to `__spirv_ControlBarrier` are handled using the `WorkItemLoopsPass` from the oneAPI Construction Kit. This pass handles barriers by splitting the kernel between calls calls to `__spirv_ControlBarrier`, and creating a wrapper that runs the subkernels over the local range. In order to correctly interface to the oneAPI Construction Kit pass pipeline, SPIRV builtins are converted to `mux` builtins (used by the OCK) by the `ConvertToMuxBuiltinsSYCLNativeCPUPass`.
+
 ## Kernel registration
 
 In order to register the Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
@@ -162,15 +195,3 @@ Each entry in the array contains the kernel name as a string, and a pointer to t
 
 The information produced by the device compiler is then employed to correctly lower the kernel LLVM-IR module to the target ISA (this is performed by the driver when `-fsycl-targets=native_cpu` is set). The object file containing the kernel code is linked with the host object file (and libsycl and any other needed library) and the final executable is ran using the Native CPU PI Plug-in, defined in [pi_native_cpu.cpp](sycl/plugins/native_cpu/pi_native_cpu.cpp).
 
-## Ongoing work
-
-* Complete support for remaining SYCL features, including but not limited to
-  * kernels with barriers
-  * math and other builtins
-  * work group local memory
-* Vectorization (e.g. Whole Function Vectorization)
-* Subgroup support
-* Performance optimizations
-* Support for multiple SYCL targets alongside native_cpu
-
-### Please note that Windows support is temporarily disabled due to some implementation details, it will be reinstantiated soon.
diff --git a/sycl/include/sycl/detail/native_cpu.hpp b/sycl/include/sycl/detail/native_cpu.hpp
index 8631d18c10ab4..76b2ee934cddc 100644
--- a/sycl/include/sycl/detail/native_cpu.hpp
+++ b/sycl/include/sycl/detail/native_cpu.hpp
@@ -51,6 +51,12 @@ struct __nativecpu_state {
     MLocal_id[2] = 0;
   }
 
+  void update(size_t group0, size_t group1, size_t group2) {
+    MWorkGroup_id[0] = group0;
+    MWorkGroup_id[1] = group1;
+    MWorkGroup_id[2] = group2;
+  }
+
   void update(size_t group0, size_t group1, size_t group2, size_t local0,
               size_t local1, size_t local2) {
     MWorkGroup_id[0] = group0;
@@ -74,12 +80,12 @@ struct __nativecpu_state {
   [[intel::device_indirectly_callable]]
 #define __NCPU_ATTRS extern "C" __SYCL_HC_ATTRS
 
-__NCPU_ATTRS size_t __dpcpp_nativecpu_global_id(
+__NCPU_ATTRS size_t __dpcpp_nativecpu_get_global_id(
     unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
   return s->MGlobal_id[dim];
 }
 
-__NCPU_ATTRS size_t __dpcpp_nativecpu_global_range(
+__NCPU_ATTRS size_t __dpcpp_nativecpu_get_global_range(
     unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
   return s->MGlobal_range[dim];
 }
@@ -108,6 +114,30 @@ __NCPU_ATTRS size_t __dpcpp_nativecpu_get_global_offset(
     unsigned dim, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
   return s->MGlobalOffset[dim];
 }
+
+__NCPU_ATTRS void __dpcpp_nativecpu_set_local_id(
+    unsigned dim, size_t value,  __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+  s->MLocal_id[dim] = value;
+    s->MGlobal_id[dim] =
+        s->MWorkGroup_size[dim] * s->MWorkGroup_id[dim] + s->MLocal_id[dim] + s->MGlobalOffset[dim];
+}
+
+__NCPU_ATTRS void __dpcpp_nativecpu_set_num_sub_groups(
+    unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+  //Todo
+}
+
+__NCPU_ATTRS void __dpcpp_nativecpu_set_sub_group_id (
+    unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+  //Todo
+}
+
+
+__NCPU_ATTRS void __dpcpp_nativecpu_set_max_sub_group_size(
+    unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+  //Todo
+}
+
 #undef __SYCL_NCPU_GLOBAL_AS
 #undef __SYCL_HC_ATTRS
 #undef __NCPU_ATTRS
diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 5e858cfd911d0..39b5a6b93d957 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -36,3 +36,42 @@ add_sycl_plugin(native_cpu
     sycl
     UnifiedRuntime-Headers
 )
+
+if(NOT DEFINED NATIVECPU_USE_OCK)
+  option(Native_CPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" On)
+endif()
+
+if(Native_CPU_USE_OCK)
+  # TODO: I'm not sure why we need this here, but we get errors when configuring without it 
+  if(NOT CMAKE_CLC_COMPILE_OBJECT)
+    set(CMAKE_CLC_COMPILE_OBJECT
+      "<CMAKE_CLC_COMPILER> <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE> -emit-llvm")
+  endif()
+  if(NOT CMAKE_CLC_CREATE_STATIC_LIBRARY)
+    set(CMAKE_CLC_CREATE_STATIC_LIBRARY
+      "<CMAKE_CLC_ARCHIVE> -o <TARGET> <OBJECTS>")
+  endif()
+  set(CMAKE_INCLUDE_FLAG_CLC "-I")
+
+  include(FetchContent)
+  FetchContent_Declare(oneapi-ck
+   GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
+   GIT_TAG        sycl_native_experimental
+  )
+  FetchContent_GetProperties(oneapi-ck)
+  if(NOT oneapi-ck_POPULATED)
+    message("Cloning oneAPI Construction Kit")
+    FetchContent_Populate(oneapi-ck)
+    message("oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
+    set(CA_NATIVE_CPU 1)
+    add_subdirectory(${oneapi-ck_SOURCE_DIR} ${oneapi-ck_BINARY_DIR})
+  endif()
+  target_compile_definitions(LLVMSYCLLowerIR PRIVATE  NATIVECPU_USE_OCK)
+  target_include_directories(LLVMSYCLLowerIR PRIVATE 
+    ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
+    ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
+    ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
+  target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-utils)
+  target_compile_definitions(pi_native_cpu PRIVATE NATIVECPU_USE_OCK)
+
+endif()
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index e04e77dd6b4bb..18cc9d3c6b578 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -200,6 +200,18 @@ if("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS)
       OpenCL-Headers
   )
 
+  if(NOT DEFINED NATIVECPU_USE_OCK)
+    option(Native_CPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" On)
+  endif()
+
+  if(Native_CPU_USE_OCK)
+    message("Compiling Native CPU adapter with OCK support.")
+    target_compile_definitions(ur_adapter_native_cpu PRIVATE NATIVECPU_USE_OCK)
+  else()
+    message("Compiling Native CPU adapter without OCK support.
+    Some valid SYCL programs may not build or may have low performance.")
+  endif()
+
   set_target_properties("ur_adapter_native_cpu" PROPERTIES
     VERSION "0.0.0"
     SOVERSION "0"
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
index 2a36caa312875..e69e585ffd20e 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -41,6 +41,37 @@ sycl::detail::NDRDescT getNDRDesc(uint32_t WorkDim,
   return Res;
 }
 
+static void runWorkGroupLoops(const sycl::detail::NDRDescT& ndr, ur_kernel_handle_t hKernel) {
+
+  __nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
+                          ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
+                          ndr.LocalSize[2], ndr.GlobalOffset[0],
+                          ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+
+  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
+  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
+  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+  for (unsigned g2 = 0; g2 < numWG2; g2++) {
+    for (unsigned g1 = 0; g1 < numWG1; g1++) {
+      for (unsigned g0 = 0; g0 < numWG0; g0++) {
+#ifdef NATIVECPU_USE_OCK
+         state.update(g0, g1, g2);
+         hKernel->_subhandler(hKernel->_args.data(), &state);
+#else
+         for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
+          for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
+            for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
+              state.update(g0, g1, g2, local0, local1, local2);
+              hKernel->_subhandler(hKernel->_args.data(), &state);
+            }
+          }
+        }
+#endif
+      }
+    }
+  }
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -66,28 +97,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       getNDRDesc(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
   hKernel->handleLocalArgs();
 
-  __nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
-                          ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
-                          ndr.LocalSize[2], ndr.GlobalOffset[0],
-                          ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
-
-  auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
-  auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
-  auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
-  for (unsigned g2 = 0; g2 < numWG2; g2++) {
-    for (unsigned g1 = 0; g1 < numWG1; g1++) {
-      for (unsigned g0 = 0; g0 < numWG0; g0++) {
-        for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
-          for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
-            for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
-              state.update(g0, g1, g2, local0, local1, local2);
-              hKernel->_subhandler(hKernel->_args.data(), &state);
-            }
-          }
-        }
-      }
-    }
-  }
+  runWorkGroupLoops(ndr, hKernel);
+  
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
diff --git a/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp b/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp
index a112c9a2e35ec..f26a966803948 100644
--- a/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp
+++ b/sycl/test/check_device_code/native_cpu/kernelhandler-scalar.cpp
@@ -1,8 +1,5 @@
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -fsycl-int-header=%t.h -S -o %t.ll %s
-// RUN: FileCheck -input-file=%t.ll %s --check-prefix=CHECK-LL
-// Compiling generated main integration header to check correctness, -fsycl
-// option used to find required includes
-// RUN: %clangxx -fsycl -D __SYCL_NATIVE_CPU__ -c -x c++ %t.h
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 #include <sycl/sycl.hpp>
 
 #include <iostream>
@@ -50,7 +47,7 @@ int main() {
   return 0;
 }
 
-// CHECK-LL-DAG: @_ZTS6init_aIiE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}
-// CHECK-LL-DAG: @_ZTS6init_aIjE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}
-// CHECK-LL-DAG: @_ZTS6init_aIfE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, float {{.*}}%2, ptr {{.*}}%3){{.*}}
-// CHECK-LL-DAG: @_ZTS6init_aIdE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, double {{.*}}%2, ptr {{.*}}%3){{.*}}
+// CHECK-DAG: @_ZTS6init_aIiE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}
+// CHECK-DAG: @_ZTS6init_aIjE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, i32 {{.*}}%2, ptr {{.*}}%3){{.*}}
+// CHECK-DAG: @_ZTS6init_aIfE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, float {{.*}}%2, ptr {{.*}}%3){{.*}}
+// CHECK-DAG: @_ZTS6init_aIdE.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, double {{.*}}%2, ptr {{.*}}%3){{.*}}
diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
index 20e1ef875c4b9..ece1728d170b0 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
@@ -1,8 +1,14 @@
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o - %s | FileCheck %s
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o - %s | FileCheck --check-prefix=CHECK-TL %s
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o - | FileCheck %s
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-TL %s
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s 
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
+
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o %t_temp.ll
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o %t_temp.ll
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // check that we added the state struct as a function argument, and that we
 // inject the calls to our builtins. We disable index flipping for SYCL Native
@@ -20,7 +26,7 @@ int main() {
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
     // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
-    // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2)
+    // CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
     // CHECK-NOT: @llvm.threadlocal
 
     // CHECK-TL:      %[[VAL1:.*]] = call ptr addrspace(1) @llvm.threadlocal.address.p1(ptr addrspace(1) @_ZL28nativecpu_thread_local_state)
@@ -38,20 +44,20 @@ int main() {
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test2>(r2, [=](sycl::id<2> id) { acc[id[1]] = 42; });
     // CHECK: @_ZTS5Test2.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
-    // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr addrspace(1) %2)
-    // CHECK: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2)
+    // CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 1, ptr addrspace(1) %2)
+    // CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
   });
   sycl::nd_range<3> r3({1, 1, 1}, {1, 1, 1});
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test3>(
         r3, [=](sycl::item<3> item) { acc[item[2]] = item.get_range(0); });
     // CHECK: @_ZTS5Test3.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 2, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 1, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_range(i32 0, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 2, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 1, ptr addrspace(1) %2)
-    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_global_id(i32 0, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 2, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 1, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 0, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 2, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 1, ptr addrspace(1) %2)
+    // CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
   });
 
   const size_t dim = 2;
diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
index 4f5f9f1c1fe1d..b4c3bf9a271df 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangxx -fsycl-device-only  -O2 -g -fexceptions -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o - %s | FileCheck %s
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // Checks that the subhandler is correctly emitted in the module
 #include <sycl/sycl.hpp>
@@ -32,7 +33,7 @@ __attribute__((sycl_kernel)) void launch(const Func &kernelFunc) {
 void test() {
   queue q;
   gen_test<int>(q);
-  //CHECK:  define void @_ZTS6init_aIiE(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} {
+  //CHECK:  define void @_ZTS6init_aIiE.SYCLNCPU(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} {
   //CHECK:  %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}}
   //CHECK:  %{{.*}} = load ptr addrspace(1), ptr %{{.*}}
   //CHECK:  %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}}
@@ -44,7 +45,7 @@ void test() {
   //CHECK:  ret void
   //CHECK:}
   gen_test<float>(q);
-  //CHECK:  define void @_ZTS6init_aIfE(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} {
+  //CHECK:  define void @_ZTS6init_aIfE.SYCLNCPU(ptr %{{.*}}, ptr addrspace(1) {{.*}}) #{{.*}} {
   //CHECK:  %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}}
   //CHECK:  %{{.*}} = load ptr addrspace(1), ptr %{{.*}}
   //CHECK:  %{{.*}} = getelementptr %{{.*}}, ptr %{{.*}}, i64 {{.*}}
@@ -64,13 +65,13 @@ void test() {
       acc[id[0]]; // all kernel arguments are removed
     });
   });
-  //CHECK:define void @_ZTS5Test1(ptr %{{.*}}, ptr addrspace(1) %[[STATE2:.*]]) #{{.*}} {
+  //CHECK:define void @_ZTS5Test1.SYCLNCPU(ptr %{{.*}}, ptr addrspace(1) %[[STATE2:.*]]) #{{.*}} {
   //CHECK:       call void @_ZTS5Test1.NativeCPUKernel(ptr addrspace(1) %[[STATE2]])
   //CHECK-NEXT:  ret void
   //CHECK-NEXT:}
 
   launch<class TestKernel>([]() {});
-  //CHECK:define void @_ZTSZ4testvE10TestKernel(ptr %{{.*}}, ptr addrspace(1) %[[STATE3:.*]]) #{{.*}} {
+  //CHECK:define void @_ZTSZ4testvE10TestKernel.SYCLNCPU(ptr %{{.*}}, ptr addrspace(1) %[[STATE3:.*]]) #{{.*}} {
   //CHECK:       call void @_ZTSZ4testvE10TestKernel.NativeCPUKernel(ptr addrspace(1) %[[STATE3]])
   //CHECK-NEXT:  ret void
   //CHECK-NEXT:}
diff --git a/sycl/test/native_cpu/barrier-external.cpp b/sycl/test/native_cpu/barrier-external.cpp
new file mode 100644
index 0000000000000..966642a01c41f
--- /dev/null
+++ b/sycl/test/native_cpu/barrier-external.cpp
@@ -0,0 +1,46 @@
+// REQUIRES: native_cpu_be
+// RUN: %clangxx -DFILE1 -fsycl -fsycl-targets=native_cpu %s -g -c -o %t1.o
+// RUN: %clangxx -DFILE2 -fsycl -fsycl-targets=native_cpu %s -g -c -o %t2.o
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t1.o %t2.o -g -o %t
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+#ifdef FILE1
+SYCL_EXTERNAL void call_barrier(nd_item<1>& it) {
+  it.barrier(access::fence_space::local_space);
+}
+#endif
+
+#ifdef FILE2
+SYCL_EXTERNAL void call_barrier(nd_item<1>& it);
+class Test;
+constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
+int main() {
+  queue q;
+  constexpr unsigned N = 10;
+  constexpr unsigned NumG = 2;
+  range<1> localR{N};
+  range<1> globalR{NumG * N};
+  buffer<int, 1> Buffer(globalR);
+  q.submit([&](handler& h) {
+      auto acc = Buffer.get_access<sycl_write>(h);
+      h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
+            acc[it.get_global_id(0)] = 0;
+            call_barrier(it);
+            acc[it.get_global_id(0)]++;
+            });
+      
+      });
+  sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
+  for(unsigned i = 0; i < N * NumG; i++) {
+    if(HostAccessor[i] != 1){
+      std::cout << "Error\n";
+      return 1;
+    }
+  }
+  std::cout << "Test passed\n";
+  return 0;
+}
+#endif
diff --git a/sycl/test/native_cpu/barrier-simple.cpp b/sycl/test/native_cpu/barrier-simple.cpp
new file mode 100644
index 0000000000000..d3bbcc63111b2
--- /dev/null
+++ b/sycl/test/native_cpu/barrier-simple.cpp
@@ -0,0 +1,35 @@
+// REQUIRES: native_cpu_be
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+class Test;
+constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
+int main() {
+  queue q;
+  constexpr unsigned N = 10;
+  constexpr unsigned NumG = 2;
+  range<1> localR{N};
+  range<1> globalR{NumG * N};
+  buffer<int, 1> Buffer(globalR);
+  q.submit([&](handler& h) {
+      auto acc = Buffer.get_access<sycl_write>(h);
+      h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
+            acc[it.get_global_id(0)] = 0;
+            it.barrier(access::fence_space::local_space);
+            acc[it.get_global_id(0)]++;
+            });
+      
+      });
+  sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
+  for(unsigned i = 0; i < N * NumG; i++) {
+    if(HostAccessor[i] != 1){
+      std::cout << "Error\n";
+      return 1;
+    }
+  }
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/native_cpu/local-id-range.cpp b/sycl/test/native_cpu/local-id-range.cpp
index c77eae7d723ab..6d9bf982780cc 100644
--- a/sycl/test/native_cpu/local-id-range.cpp
+++ b/sycl/test/native_cpu/local-id-range.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: native_cpu_be
-// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -mllvm -inline-threshold=500 %s -o %t
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
 
 #include <clocale>
diff --git a/sycl/test/native_cpu/matrix-multiply.cpp b/sycl/test/native_cpu/matrix-multiply.cpp
new file mode 100644
index 0000000000000..520fd0292d5fe
--- /dev/null
+++ b/sycl/test/native_cpu/matrix-multiply.cpp
@@ -0,0 +1,397 @@
+// REQUIRES: native_cpu_be
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -o %t
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t 128 sycl
+//
+/***************************************************************************
+ *
+ *  Copyright (C) 2016 Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  Codeplay's ComputeCpp SDK
+ *
+ *  matrix-multiply.cpp
+ *
+ *  Description:
+ *    Example of matrix multiplication in SYCL.
+ *
+ **************************************************************************/
+
+/*  This example compares an OpenMP blocked matrix multiplication
+ *  implementation with a SYCL blocked matrix multiplication example.
+ *  The purpose is not to compare performance, but to show the similarities
+ *  and differences between them.
+ *  See block_host for the OpenMP implementation. */
+
+#include <CL/sycl.hpp>
+
+#include <chrono>
+#include <cmath>
+#include <ctime>
+#include <iostream>
+
+using namespace cl::sycl;
+
+class mxm_kernel;
+
+void display_matrix(float* m, int matSize) {
+  if (matSize > 16) {
+    return;
+  }
+
+  std::cout << "=======" << std::endl;
+  for (int i = 0; i < matSize; i++) {
+    for (int j = 0; j < matSize; j++) {
+      std::cout << m[i * matSize + j] << " ";
+    }
+    std::cout << std::endl;
+  }
+  std::cout << "=======" << std::endl;
+  ;
+}
+
+/* Implements a host C++ version of the matrix multiplication.
+ * If compiler supports OpenMP, code is parallelized. Scheduling
+ * uses static chunks of block_size. */
+void block_host(float* MA, float* MB, float* MC, int matSize) {
+  /* We set the block size to 32 for simplicity, though the optimal
+   * value will depend on the platform this is run on. */
+  int block_size = 32;
+  int numBlocks = block_size / matSize;
+  int extraBlockLength = block_size % matSize;
+  numBlocks = extraBlockLength ? (numBlocks + 1) : (numBlocks);
+
+#pragma omp parallel for collapse(2)
+  for (int bIndexI = 0; bIndexI < matSize; bIndexI += block_size)
+    for (int bIndexJ = 0; bIndexJ < matSize; bIndexJ += block_size)
+      for (int bIndexK = 0; bIndexK < matSize; bIndexK += block_size) {
+        int i = bIndexI;
+        int j = bIndexJ;
+        int k = bIndexK;
+        for (int bi = i; bi < std::min(i + block_size, matSize); bi++)
+          for (int bj = j; bj < std::min(j + block_size, matSize); bj++)
+            for (int bk = k; bk < std::min(k + block_size, matSize); bk++) {
+              MC[bi * matSize + bj] +=
+                  MA[bi * matSize + bk] * MB[bk * matSize + bj];
+            }
+      }
+}
+
+/* Obtains the previous power of two from the given integer.
+ * It works by masking out all ones after the first one bit,
+ * then leaves the first one bit intact, effectively
+ * yielding the first power of two < x. */
+inline int prevPowerOfTwo(int x) {
+  if (x < 0) {
+    return 0;
+  }
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return x - (x >> 1);
+}
+
+/* Checks if X is a power of two.
+ * If there are bits sets to one after AND with the
+ * previous number, then it is not a power of two.
+ */
+inline bool isPowerOfTwo(int x) { return (x & (x - 1)) == 0; }
+
+/* Function template that performs the matrix * matrix operation. (It is
+ * a template because only some OpenCL devices support double-precision
+ * floating-point numbers, but it is interesting to make the comparison
+ * where available.)
+ * Broadly, the function chooses an appropriate work size, then enqueues
+ * the matrix * matrix lambda on the queue provided. Because the queues
+ * are constructed inside this function, it will block until the work is
+ * finished.
+ * Note that this example only works for powers of two.
+ * */
+template <typename T>
+bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, int matSize) {
+  // Make sure it is power of two before running
+  if (!isPowerOfTwo(matSize)) {
+    std::cout << " This example only works with power of two sizes "
+              << std::endl;
+    return true;
+  }
+
+  auto device = q.get_device();
+  auto maxBlockSize =
+      device.get_info<cl::sycl::info::device::max_work_group_size>();
+  auto blockSize = prevPowerOfTwo(std::sqrt(maxBlockSize));
+  std::cout << " The Device Max Work Group Size is : " << maxBlockSize
+            << std::endl;
+  std::cout << " The order is : " << matSize << std::endl;
+  std::cout << " The blockSize is : " << blockSize << std::endl;
+  // Make sure the block size is not larger than the mat size
+  blockSize = std::min(matSize, blockSize);
+
+  {
+    /* Buffers can be constructed with property lists. In this example,
+     * the buffer is given the property "use host pointer", which tells
+     * the runtime to use the host pointer for all data storage (instead
+     * of making copies internally). Additionally, when running on a
+     * device that shares memory with the host (for example a CPU),
+     * "zero-copy" memory optimisations can be used by the driver. */
+    range<1> dimensions(matSize * matSize);
+    const property_list props = {property::buffer::use_host_ptr()};
+    buffer<T> bA(MA, dimensions, props);
+    buffer<T> bB(MB, dimensions, props);
+    buffer<T> bC(MC, dimensions, props);
+
+    q.submit([&](handler& cgh) {
+      auto pA = bA.template get_access<access::mode::read>(cgh);
+      auto pB = bB.template get_access<access::mode::read>(cgh);
+      auto pC = bC.template get_access<access::mode::write>(cgh);
+      auto localRange = range<1>(blockSize * blockSize);
+
+      accessor<T, 1, access::mode::read_write, access::target::local> pBA(
+          localRange, cgh);
+      accessor<T, 1, access::mode::read_write, access::target::local> pBB(
+          localRange, cgh);
+
+      cgh.parallel_for<mxm_kernel>(
+          nd_range<2>{range<2>(matSize, matSize),
+                      range<2>(blockSize, blockSize)},
+          [=](nd_item<2> it) {
+            // Current block
+            int blockX = it.get_group(1);
+            int blockY = it.get_group(0);
+
+            // Current local item
+            int localX = it.get_local_id(1);
+            int localY = it.get_local_id(0);
+
+            // Start in the A matrix
+            int a_start = matSize * blockSize * blockY;
+            // End in the b matrix
+            int a_end = a_start + matSize - 1;
+            // Start in the b matrix
+            int b_start = blockSize * blockX;
+
+            // Result for the current C(i,j) element
+            T tmp = 0.0f;
+            // We go through all a, b blocks
+            for (int a = a_start, b = b_start; a <= a_end;
+                 a += blockSize, b += (blockSize * matSize)) {
+              // Copy the values in shared memory collectively
+              pBA[localY * blockSize + localX] =
+                  pA[a + matSize * localY + localX];
+              // Note the swap of X/Y to maintain contiguous access
+              pBB[localX * blockSize + localY] =
+                  pB[b + matSize * localY + localX];
+              it.barrier(access::fence_space::local_space);
+              // Now each thread adds the value of its sum
+              for (int k = 0; k < blockSize; k++) {
+                tmp +=
+                    pBA[localY * blockSize + k] * pBB[localX * blockSize + k];
+              }
+              // The barrier ensures that all threads have written to local
+              // memory before continuing
+              it.barrier(access::fence_space::local_space);
+            }
+            auto elemIndex = it.get_global_id(0) * it.get_global_range()[1] +
+                             it.get_global_id(1);
+            // Each thread updates its position
+            pC[elemIndex] = tmp;
+          });
+    });
+  }
+  return false;
+}
+
+/* Helper function to indicate the parameters the sample takes. */
+void usage(std::string programName) {
+  std::cout << " Incorrect number of parameters " << std::endl;
+  std::cout << " Usage: " << std::endl;
+  std::cout << programName << " [matrix size] [omp|sycl]" << std::endl;
+  std::cout << "[matrix size] : Size of the matrix to multiply (minimum 32)"
+            << std::endl;
+  std::cout << "[omp|sycl]    : Run the OpenMP or the SYCL variant. "
+            << " Default is to use both " << std::endl;
+}
+
+int main(int argc, char* argv[]) {
+  float* MA;
+  float* MB;
+  float* MC;
+  bool sycl = true;
+  bool omp = true;
+  bool error = false;
+
+  if (argc != 2 && argc != 3) {
+    usage(argv[0]);
+    return 1;
+  }
+
+  int matSize = 0;
+  try {
+    matSize = std::stoi(argv[1]);
+  } catch (...) {
+    usage(argv[0]);
+    return 1;
+  }
+
+  if (matSize < 32) {
+    usage(argv[0]);
+    return 1;
+  }
+
+  if (argc == 3) {
+    if (std::string(argv[2]) == "omp") {
+      omp = true;
+      sycl = false;
+    } else if (std::string(argv[2]) == "sycl") {
+      omp = false;
+      sycl = true;
+    } else {
+      usage(argv[0]);
+    }
+  }
+
+  MA = new float[matSize * matSize];
+  MB = new float[matSize * matSize];
+  MC = new float[matSize * matSize];
+
+// Matrix initialization
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < matSize; i++)
+    for (int j = 0; j < matSize; j++) {
+      MA[i * matSize + j] = 0.0f;
+      if (i == j) {
+        MA[i * matSize + j] = 1.0f;
+      }
+      MB[i * matSize + j] = 2.0f;
+      MC[i * matSize + j] = 0.0f;  // i * matSize + j;
+    }
+
+  std::cout << " Input matrix " << std::endl;
+  display_matrix(MA, matSize);
+  display_matrix(MB, matSize);
+  display_matrix(MC, matSize);
+
+  if (omp) {
+#if defined(_OPENMP)
+    std::cout << "OpenMP: ";
+#else
+    std::cout << "C++: ";
+#endif
+
+    {
+      auto start = std::chrono::steady_clock::now();
+      block_host(MA, MB, MC, matSize);
+      auto end = std::chrono::steady_clock::now();
+      auto time =
+          std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+              .count();
+      std::cout << "Time: " << time << std::endl;
+      float flops =
+          (2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
+      std::cout << "GFLOPs: " << flops << std::endl;
+
+      bool error = false;
+      // Testing
+      for (int i = 0; i < matSize; i++)
+        for (int j = 0; j < matSize; j++) {
+          if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
+            std::cout << " Position " << i << ", " << j
+                      << " differs: " << MC[i * matSize + j]
+                      << " != " << MB[i * matSize + j] << std::endl;
+            error = true;
+          }
+        }
+      if (!error) {
+        std::cout << "Success" << std::endl;
+      } else {
+        std::cout << " Error in the computation " << std::endl;
+      }
+    }
+  }
+
+  if (sycl) {
+    std::cout << " ***** SYCL " << std::endl;
+    // Matrix initialization
+    for (int i = 0; i < matSize; i++)
+      for (int j = 0; j < matSize; j++) {
+        MC[i * matSize + j] = 0.0f;  // i * matSize + j;
+      }
+
+    {
+      {
+        /* Create the SYCL queue - note that we add an async handler function
+         * to capture potential asynchronous errors. This function will be
+         * called every time there is an asynchronous error on the queue (i.e.
+         * some error occurs while the queue is executing kernels) and one of
+         * cl::sycl::queue::throw() or cl::sycl::queue::wait_and_throw() is
+         * called. */
+        queue q([&](exception_list eL) {
+          try {
+            for (auto& e : eL) {
+              std::rethrow_exception(e);
+            }
+          } catch (cl::sycl::exception e) {
+            std::cout << " An exception has been thrown: " << e.what()
+                      << std::endl;
+          }
+        });
+
+        auto start = std::chrono::steady_clock::now();
+        error = local_mxm(q, MA, MB, MC, matSize);
+        q.wait_and_throw();
+        auto end = std::chrono::steady_clock::now();
+        auto time =
+            std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+                .count();
+        std::cout << "SYCL: ";
+        std::cout << "Time: " << time << std::endl;
+        float flops =
+            (2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
+        std::cout << "GFLOPs: " << flops << std::endl;
+        std::cout << " Output " << std::endl;
+      }
+
+      if (!error) {
+        display_matrix(MC, matSize);
+        error = false;
+        // Testing
+        for (int i = 0; i < matSize; i++)
+          for (int j = 0; j < matSize; j++) {
+            if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
+              std::cout << " Position " << i << ", " << j
+                        << " differs: " << MC[i * matSize + j]
+                        << " != " << MB[i * matSize + j] << std::endl;
+              error = true;
+            }
+          }
+        if (!error) {
+          std::cout << "Success" << std::endl;
+          ;
+        } else {
+          std::cout << " Error in the computation " << std::endl;
+        }
+      }
+    }
+  }
+
+  delete[] MA;
+  delete[] MB;
+  delete[] MC;
+
+  return error ? 1 : 0;
+}
diff --git a/sycl/test/native_cpu/sycl-external-static.cpp b/sycl/test/native_cpu/sycl-external-static.cpp
new file mode 100644
index 0000000000000..a8409cc1f3ac2
--- /dev/null
+++ b/sycl/test/native_cpu/sycl-external-static.cpp
@@ -0,0 +1,69 @@
+// REQUIRES: native_cpu_be
+// Check that kernel can call a SYCL_EXTERNAL function defined in a
+// static library.
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DSOURCE1 %s -c -o %t1.o
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DSOURCE2 %s -c -o %t2.o
+// RUN: rm -f %t.a
+// RUN: llvm-ar crv %t.a %t1.o
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t2.o %t.a -o %t
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
+// This currently fails because we have a static array of functions which
+// we do not support when handling barriers.
+// XFAIL: *
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+#ifdef SOURCE1
+int bar(int b);
+
+SYCL_EXTERNAL
+int foo(int a) { return bar(a); }
+
+__attribute((noinline)) int bar(int b) {
+#ifdef __SYCL_DEVICE_ONLY__
+  return 1;
+#else
+  return 0;
+#endif
+}
+#endif // SOURCE1
+
+#ifdef SOURCE2
+SYCL_EXTERNAL
+int foo(int A);
+
+int main(void) {
+  constexpr unsigned Size = 4;
+  int A[Size] = {1, 2, 3, 4};
+  int B[Size] = {1, 2, 3, 4};
+  int C[Size];
+
+  {
+    sycl::range<1> range{Size};
+    sycl::buffer<int, 1> bufA(A, range);
+    sycl::buffer<int, 1> bufB(B, range);
+    sycl::buffer<int, 1> bufC(C, range);
+
+    sycl::queue().submit([&](sycl::handler &cgh) {
+      auto accA = bufA.get_access<sycl::access::mode::read>(cgh);
+      auto accB = bufB.get_access<sycl::access::mode::read>(cgh);
+      auto accC = bufC.get_access<sycl::access::mode::write>(cgh);
+
+      cgh.parallel_for<class Test>(
+          range, [=](sycl::id<1> ID) { accC[ID] = foo(accA[ID]); });
+    });
+  }
+
+  for (unsigned I = 0; I < Size; ++I) {
+    int Ref = foo(A[I]);
+    if (C[I] != 1) {
+      std::cout << "fail: [" << I << "] == " << C[I] << ", expected " << 1
+                << "\n";
+      return 1;
+    }
+  }
+  std::cout << "pass\n";
+  return 0;
+}
+#endif // SOURCE2
diff --git a/sycl/test/native_cpu/sycl-external.cpp b/sycl/test/native_cpu/sycl-external.cpp
index 040e1314e0971..daa7b952c51d8 100644
--- a/sycl/test/native_cpu/sycl-external.cpp
+++ b/sycl/test/native_cpu/sycl-external.cpp
@@ -5,13 +5,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu -DSOURCE2 %s -c -o %t2.o
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t1.o %t2.o -o %t
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
-//
-// Test2 - check that kernel can call a SYCL_EXTERNAL function defined in a
-// static library.
-// RUN: rm -f %t.a
-// RUN: llvm-ar crv %t.a %t1.o
-// RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t2.o -foffload-static-lib=%t.a -o %t
-// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
 
 #include <iostream>
 #include <sycl/sycl.hpp>

From aed012f8fa2d137213f05f03974b01c247520e88 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Fri, 20 Oct 2023 12:09:16 +0100
Subject: [PATCH 02/43] formatting

---
 .../ConvertToMuxBuiltinsSYCLNativeCPU.h       |  3 +--
 .../llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h  |  3 ++-
 llvm/include/llvm/SYCLLowerIR/SYCLUtils.h     |  2 +-
 .../ConvertToMuxBuiltinsSYCLNativeCPU.cpp     |  2 +-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp |  2 +-
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 11 ++++----
 sycl/include/sycl/detail/native_cpu.hpp       | 21 ++++++++--------
 .../ur/adapters/native_cpu/enqueue.cpp        | 11 ++++----
 .../native_cpu/native_cpu_builtins.cpp        |  2 +-
 sycl/test/native_cpu/barrier-external.cpp     | 25 +++++++++----------
 sycl/test/native_cpu/barrier-simple.cpp       | 21 ++++++++--------
 sycl/test/native_cpu/matrix-multiply.cpp      | 22 ++++++++--------
 12 files changed, 62 insertions(+), 63 deletions(-)

diff --git a/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
index c45c985cd0af5..290409fa2f0c0 100644
--- a/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Converts SPIRV builtins to Mux builtins used by the oneAPI Construction 
+// Converts SPIRV builtins to Mux builtins used by the oneAPI Construction
 // Kit for SYCL Native CPU
 //
 //===----------------------------------------------------------------------===//
 
-
 #pragma once
 
 #include "llvm/IR/Module.h"
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h b/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
index c28c056cd1f3a..e87fb2a0b415b 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
@@ -1,5 +1,6 @@
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
-void addSYCLNativeCPUBackendPasses(ModulePassManager& MPM, ModuleAnalysisManager& MAM);
+void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
+                                   ModuleAnalysisManager &MAM);
 } // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
index e4fc8660c90cf..0d4ffd36a6313 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
@@ -120,7 +120,7 @@ inline bool isSYCLExternalFunction(const Function *F) {
 constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU";
 constexpr char SYCLNATIVECPUKERNEL[] = ".NativeCPUKernel";
 inline llvm::Twine addSYCLNativeCPUSuffix(StringRef S) {
-  if(S.startswith("__dpcpp_nativecpu") || S.endswith(SYCLNATIVECPUKERNEL))
+  if (S.startswith("__dpcpp_nativecpu") || S.endswith(SYCLNATIVECPUKERNEL))
     return S;
   return llvm::Twine(S, SYCLNATIVECPUSUFFIX);
 }
diff --git a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
index 563baba03c1a6..1132f575492f7 100644
--- a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
@@ -108,7 +108,7 @@ bool replaceBarriers(Module &M) {
     return false;
   }
   static auto *MuxBarrierFunc = getMuxBarrierFunc(M);
-  SmallVector<std::pair<Instruction *, Instruction*>> ToRemove;
+  SmallVector<std::pair<Instruction *, Instruction *>> ToRemove;
   auto *Zero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
   for (auto &Use : SPIRVBarrierFunc->uses()) {
     auto *I = dyn_cast<CallInst>(Use.getUser());
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 769b8fc862bf6..0501d5c092947 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -1,7 +1,7 @@
-#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index c860d3371f19c..88cb37d403223 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -48,6 +48,7 @@
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
+#include "compiler/utils/attributes.h"
 #endif
 
 using namespace llvm;
@@ -205,11 +206,8 @@ static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
 }
 
 void fixUpKernelNameAfterBarrier(Function &F) {
-  Attribute Attr = F.getFnAttribute("mux-base-fn-name");
-  if (Attr.isValid()) {
-    auto Name = Attr.getValueAsString();
-    F.setName(Name);
-  }
+  auto Name = compiler::utils::getBaseFnNameOrFnName(F);
+  F.setName(Name);
 }
 
 static inline bool IsNativeCPUKernel(const Function *F) {
@@ -349,7 +347,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   // just define __mux_work_group_barrier as a no-op to avoid linker errors.
   // Todo: currently we can't remove the function here even if it has no uses,
   // because we may still emit a declaration for in the offload-wrapper.
-  auto BarrierF = M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier);
+  auto BarrierF =
+      M.getFunction(compiler::utils::MuxBuiltins::work_group_barrier);
   if (BarrierF && BarrierF->isDeclaration()) {
     IRBuilder<> Builder(M.getContext());
     auto BB = BasicBlock::Create(M.getContext(), "noop", BarrierF);
diff --git a/sycl/include/sycl/detail/native_cpu.hpp b/sycl/include/sycl/detail/native_cpu.hpp
index 76b2ee934cddc..f189dffdf22ea 100644
--- a/sycl/include/sycl/detail/native_cpu.hpp
+++ b/sycl/include/sycl/detail/native_cpu.hpp
@@ -115,27 +115,28 @@ __NCPU_ATTRS size_t __dpcpp_nativecpu_get_global_offset(
   return s->MGlobalOffset[dim];
 }
 
-__NCPU_ATTRS void __dpcpp_nativecpu_set_local_id(
-    unsigned dim, size_t value,  __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+__NCPU_ATTRS void
+__dpcpp_nativecpu_set_local_id(unsigned dim, size_t value,
+                               __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
   s->MLocal_id[dim] = value;
-    s->MGlobal_id[dim] =
-        s->MWorkGroup_size[dim] * s->MWorkGroup_id[dim] + s->MLocal_id[dim] + s->MGlobalOffset[dim];
+  s->MGlobal_id[dim] = s->MWorkGroup_size[dim] * s->MWorkGroup_id[dim] +
+                       s->MLocal_id[dim] + s->MGlobalOffset[dim];
 }
 
 __NCPU_ATTRS void __dpcpp_nativecpu_set_num_sub_groups(
     unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
-  //Todo
+  // Todo
 }
 
-__NCPU_ATTRS void __dpcpp_nativecpu_set_sub_group_id (
-    unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
-  //Todo
+__NCPU_ATTRS void
+__dpcpp_nativecpu_set_sub_group_id(unsigned value,
+                                   __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
+  // Todo
 }
 
-
 __NCPU_ATTRS void __dpcpp_nativecpu_set_max_sub_group_size(
     unsigned value, __SYCL_NCPU_GLOBAL_AS __nativecpu_state *s) {
-  //Todo
+  // Todo
 }
 
 #undef __SYCL_NCPU_GLOBAL_AS
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
index e69e585ffd20e..e3a3cac2ea011 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -41,7 +41,8 @@ sycl::detail::NDRDescT getNDRDesc(uint32_t WorkDim,
   return Res;
 }
 
-static void runWorkGroupLoops(const sycl::detail::NDRDescT& ndr, ur_kernel_handle_t hKernel) {
+static void runWorkGroupLoops(const sycl::detail::NDRDescT &ndr,
+                              ur_kernel_handle_t hKernel) {
 
   __nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
                           ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
@@ -55,10 +56,10 @@ static void runWorkGroupLoops(const sycl::detail::NDRDescT& ndr, ur_kernel_handl
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {
 #ifdef NATIVECPU_USE_OCK
-         state.update(g0, g1, g2);
-         hKernel->_subhandler(hKernel->_args.data(), &state);
+        state.update(g0, g1, g2);
+        hKernel->_subhandler(hKernel->_args.data(), &state);
 #else
-         for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
+        for (unsigned local2 = 0; local2 < ndr.LocalSize[2]; local2++) {
           for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) {
             for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) {
               state.update(g0, g1, g2, local0, local1, local2);
@@ -98,7 +99,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   hKernel->handleLocalArgs();
 
   runWorkGroupLoops(ndr, hKernel);
-  
+
   // TODO: we should avoid calling clear here by avoiding using push_back
   // in setKernelArgs.
   hKernel->_args.clear();
diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
index ece1728d170b0..398445ede08b2 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
 // RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s 
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
 // RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o %t_temp.ll
diff --git a/sycl/test/native_cpu/barrier-external.cpp b/sycl/test/native_cpu/barrier-external.cpp
index 966642a01c41f..224e1babf43f4 100644
--- a/sycl/test/native_cpu/barrier-external.cpp
+++ b/sycl/test/native_cpu/barrier-external.cpp
@@ -8,13 +8,13 @@
 using namespace sycl;
 
 #ifdef FILE1
-SYCL_EXTERNAL void call_barrier(nd_item<1>& it) {
+SYCL_EXTERNAL void call_barrier(nd_item<1> &it) {
   it.barrier(access::fence_space::local_space);
 }
 #endif
 
 #ifdef FILE2
-SYCL_EXTERNAL void call_barrier(nd_item<1>& it);
+SYCL_EXTERNAL void call_barrier(nd_item<1> &it);
 class Test;
 constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
 int main() {
@@ -24,18 +24,17 @@ int main() {
   range<1> localR{N};
   range<1> globalR{NumG * N};
   buffer<int, 1> Buffer(globalR);
-  q.submit([&](handler& h) {
-      auto acc = Buffer.get_access<sycl_write>(h);
-      h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
-            acc[it.get_global_id(0)] = 0;
-            call_barrier(it);
-            acc[it.get_global_id(0)]++;
-            });
-      
-      });
+  q.submit([&](handler &h) {
+    auto acc = Buffer.get_access<sycl_write>(h);
+    h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
+      acc[it.get_global_id(0)] = 0;
+      call_barrier(it);
+      acc[it.get_global_id(0)]++;
+    });
+  });
   sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
-  for(unsigned i = 0; i < N * NumG; i++) {
-    if(HostAccessor[i] != 1){
+  for (unsigned i = 0; i < N * NumG; i++) {
+    if (HostAccessor[i] != 1) {
       std::cout << "Error\n";
       return 1;
     }
diff --git a/sycl/test/native_cpu/barrier-simple.cpp b/sycl/test/native_cpu/barrier-simple.cpp
index d3bbcc63111b2..9d064712b3b0f 100644
--- a/sycl/test/native_cpu/barrier-simple.cpp
+++ b/sycl/test/native_cpu/barrier-simple.cpp
@@ -14,18 +14,17 @@ int main() {
   range<1> localR{N};
   range<1> globalR{NumG * N};
   buffer<int, 1> Buffer(globalR);
-  q.submit([&](handler& h) {
-      auto acc = Buffer.get_access<sycl_write>(h);
-      h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
-            acc[it.get_global_id(0)] = 0;
-            it.barrier(access::fence_space::local_space);
-            acc[it.get_global_id(0)]++;
-            });
-      
-      });
+  q.submit([&](handler &h) {
+    auto acc = Buffer.get_access<sycl_write>(h);
+    h.parallel_for<Test>(nd_range<1>{globalR, localR}, [=](nd_item<1> it) {
+      acc[it.get_global_id(0)] = 0;
+      it.barrier(access::fence_space::local_space);
+      acc[it.get_global_id(0)]++;
+    });
+  });
   sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
-  for(unsigned i = 0; i < N * NumG; i++) {
-    if(HostAccessor[i] != 1){
+  for (unsigned i = 0; i < N * NumG; i++) {
+    if (HostAccessor[i] != 1) {
       std::cout << "Error\n";
       return 1;
     }
diff --git a/sycl/test/native_cpu/matrix-multiply.cpp b/sycl/test/native_cpu/matrix-multiply.cpp
index 520fd0292d5fe..3f21733620090 100644
--- a/sycl/test/native_cpu/matrix-multiply.cpp
+++ b/sycl/test/native_cpu/matrix-multiply.cpp
@@ -46,7 +46,7 @@ using namespace cl::sycl;
 
 class mxm_kernel;
 
-void display_matrix(float* m, int matSize) {
+void display_matrix(float *m, int matSize) {
   if (matSize > 16) {
     return;
   }
@@ -65,7 +65,7 @@ void display_matrix(float* m, int matSize) {
 /* Implements a host C++ version of the matrix multiplication.
  * If compiler supports OpenMP, code is parallelized. Scheduling
  * uses static chunks of block_size. */
-void block_host(float* MA, float* MB, float* MC, int matSize) {
+void block_host(float *MA, float *MB, float *MC, int matSize) {
   /* We set the block size to 32 for simplicity, though the optimal
    * value will depend on the platform this is run on. */
   int block_size = 32;
@@ -123,7 +123,7 @@ inline bool isPowerOfTwo(int x) { return (x & (x - 1)) == 0; }
  * Note that this example only works for powers of two.
  * */
 template <typename T>
-bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, int matSize) {
+bool local_mxm(cl::sycl::queue &q, T *MA, T *MB, T *MC, int matSize) {
   // Make sure it is power of two before running
   if (!isPowerOfTwo(matSize)) {
     std::cout << " This example only works with power of two sizes "
@@ -155,7 +155,7 @@ bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, int matSize) {
     buffer<T> bB(MB, dimensions, props);
     buffer<T> bC(MC, dimensions, props);
 
-    q.submit([&](handler& cgh) {
+    q.submit([&](handler &cgh) {
       auto pA = bA.template get_access<access::mode::read>(cgh);
       auto pB = bB.template get_access<access::mode::read>(cgh);
       auto pC = bC.template get_access<access::mode::write>(cgh);
@@ -227,10 +227,10 @@ void usage(std::string programName) {
             << " Default is to use both " << std::endl;
 }
 
-int main(int argc, char* argv[]) {
-  float* MA;
-  float* MB;
-  float* MC;
+int main(int argc, char *argv[]) {
+  float *MA;
+  float *MB;
+  float *MC;
   bool sycl = true;
   bool omp = true;
   bool error = false;
@@ -278,7 +278,7 @@ int main(int argc, char* argv[]) {
         MA[i * matSize + j] = 1.0f;
       }
       MB[i * matSize + j] = 2.0f;
-      MC[i * matSize + j] = 0.0f;  // i * matSize + j;
+      MC[i * matSize + j] = 0.0f; // i * matSize + j;
     }
 
   std::cout << " Input matrix " << std::endl;
@@ -329,7 +329,7 @@ int main(int argc, char* argv[]) {
     // Matrix initialization
     for (int i = 0; i < matSize; i++)
       for (int j = 0; j < matSize; j++) {
-        MC[i * matSize + j] = 0.0f;  // i * matSize + j;
+        MC[i * matSize + j] = 0.0f; // i * matSize + j;
       }
 
     {
@@ -342,7 +342,7 @@ int main(int argc, char* argv[]) {
          * called. */
         queue q([&](exception_list eL) {
           try {
-            for (auto& e : eL) {
+            for (auto &e : eL) {
               std::rethrow_exception(e);
             }
           } catch (cl::sycl::exception e) {

From 798e2d15556ac14c0ecf7d537765c4ebf8962197 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Fri, 20 Oct 2023 12:13:46 +0100
Subject: [PATCH 03/43] formatting

---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 88cb37d403223..d6a6c7af82881 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -47,8 +47,8 @@
 #include <vector>
 
 #ifdef NATIVECPU_USE_OCK
-#include "compiler/utils/builtin_info.h"
 #include "compiler/utils/attributes.h"
+#include "compiler/utils/builtin_info.h"
 #endif
 
 using namespace llvm;

From 4233cf8520d8b88cbe455635dc25ae3d5856a364 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Fri, 20 Oct 2023 12:27:54 +0100
Subject: [PATCH 04/43] Remove def for fixUpKernelNameAfterBarrier

---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index d6a6c7af82881..1701412594b40 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -205,11 +205,6 @@ static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
   return F->getArg(FT->getNumParams() - 1);
 }
 
-void fixUpKernelNameAfterBarrier(Function &F) {
-  auto Name = compiler::utils::getBaseFnNameOrFnName(F);
-  F.setName(Name);
-}
-
 static inline bool IsNativeCPUKernel(const Function *F) {
   return F->getCallingConv() == llvm::CallingConv::SPIR_KERNEL;
 }
@@ -274,7 +269,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   SmallVector<Function *> NewKernels;
   for (auto &OldF : OldKernels) {
 #ifdef NATIVECPU_USE_OCK
-    fixUpKernelNameAfterBarrier(*OldF);
+    auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
+    OldF->setName(Name);
 #endif
     auto *NewF =
         cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);

From 5d76bd6bfe9af53bdf3f959681856423151c8013 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Mon, 23 Oct 2023 08:49:00 +0100
Subject: [PATCH 05/43] Licence header

---
 clang/lib/CodeGen/BackendUtil.cpp              |  2 +-
 .../llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h   | 18 ++++++++++++++++++
 .../llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h   |  6 ------
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 15 ++++++++++++++-
 4 files changed, 33 insertions(+), 8 deletions(-)
 create mode 100644 llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
 delete mode 100644 llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index f6ddf420a30f6..e8d845528918b 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -50,7 +50,7 @@
 #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
 #include "llvm/SYCLLowerIR/MutatePrintfAddrspace.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
-#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
+#include "llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
new file mode 100644
index 0000000000000..a9a60c666ff1f
--- /dev/null
+++ b/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
@@ -0,0 +1,18 @@
+//===----- PipelineSYCLNativeCPU.h - Pass pipeline for SYCL Native CPU ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the pass pipeline used when lowering device code for SYCL Native 
+// CPU. 
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
+                                   ModuleAnalysisManager &MAM);
+} // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h b/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
deleted file mode 100644
index e87fb2a0b415b..0000000000000
--- a/llvm/include/llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "llvm/Target/TargetMachine.h"
-
-namespace llvm {
-void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM);
-} // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 0501d5c092947..64d6d9fec3b89 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -1,7 +1,20 @@
+//===---- PipelineSYCLNativeCPU.cpp - Pass pipeline for SYCL Native CPU ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the pass pipeline used when lowering device code for SYCL Native 
+// CPU. 
+// When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
+//
+//===----------------------------------------------------------------------===//
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
-#include "llvm/SYCLLowerIR/SYCLNativeCPUPipeline.h"
+#include "llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"

From 509447ce49e5e92f678af017250886dc01da26e2 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Mon, 23 Oct 2023 08:49:14 +0100
Subject: [PATCH 06/43] Update lit test

---
 clang/test/CodeGenSYCL/native_cpu_basic.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/clang/test/CodeGenSYCL/native_cpu_basic.cpp b/clang/test/CodeGenSYCL/native_cpu_basic.cpp
index 0fa10a431cb42..b442c9a6aa3b5 100644
--- a/clang/test/CodeGenSYCL/native_cpu_basic.cpp
+++ b/clang/test/CodeGenSYCL/native_cpu_basic.cpp
@@ -1,5 +1,4 @@
 // This test checks for some basic Front End features for Native CPU:
-// * Kernel name mangling
 // * is-native-cpu module flag
 // RUN: %clang_cc1 -fsycl-is-device -S -emit-llvm -internal-isystem %S/Inputs -fsycl-is-native-cpu -o %t.ll %s 
 // RUN: FileCheck -input-file=%t.ll %s 
@@ -49,9 +48,5 @@ void gen() {
   test<float>(q);
 }
 
-// Check name mangling 
-// CHECK-DAG: @_ZTS6init_aIiE({{.*}})
-// CHECK-DAG: @_ZTS6init_aIfE({{.*}})
-
 // Check Native CPU module flag
 // CHECK-DAG: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1}

From 9586b39ef39d1ae7b0cf9297328a12199e2c4492 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Mon, 23 Oct 2023 08:58:41 +0100
Subject: [PATCH 07/43] formatting

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 64d6d9fec3b89..0197814bd1f2e 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -6,15 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Defines the pass pipeline used when lowering device code for SYCL Native 
-// CPU. 
+// Defines the pass pipeline used when lowering device code for SYCL Native
+// CPU.
 // When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
-#include "llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"

From c297e24f2090e79ffda0bf5f72fc54dec4085235 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Mon, 23 Oct 2023 09:02:25 +0100
Subject: [PATCH 08/43] formatting

---
 clang/lib/CodeGen/BackendUtil.cpp                     | 2 +-
 llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index e8d845528918b..ca68317e6dac3 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -49,8 +49,8 @@
 #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
 #include "llvm/SYCLLowerIR/MutatePrintfAddrspace.h"
-#include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CommandLine.h"
diff --git a/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
index a9a60c666ff1f..33691f71f869a 100644
--- a/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/PipelineSYCLNativeCPU.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Defines the pass pipeline used when lowering device code for SYCL Native 
-// CPU. 
+// Defines the pass pipeline used when lowering device code for SYCL Native
+// CPU.
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/Target/TargetMachine.h"

From 5a62505e00cd57f2854cc85f0179e2959db0f2ce Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Wed, 25 Oct 2023 11:56:03 +0100
Subject: [PATCH 09/43] formatting

---
 clang/lib/CodeGen/BackendUtil.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 23db5045e6da0..96ca3fdf0bf94 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
@@ -92,7 +93,6 @@
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
-#include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"

From 0b140ef853e34aedc12d2e775f3ee6fcacf59b74 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Tue, 31 Oct 2023 12:07:52 +0000
Subject: [PATCH 10/43] [wip] vecz integration

---
 .../ConvertToMuxBuiltinsSYCLNativeCPU.cpp     | 20 ++++++++-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 32 ++++++++++++++-
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 41 ++++++++++++-------
 sycl/plugins/native_cpu/CMakeLists.txt        |  7 ++--
 .../ur/adapters/native_cpu/enqueue.cpp        | 10 +++++
 5 files changed, 90 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
index 1132f575492f7..b83e9a422cb83 100644
--- a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
@@ -25,6 +25,22 @@ using namespace llvm;
 
 namespace {
 
+static void fixCallingConv(Function *F) {
+  // The frame-pointer=all and the "byval" attributes lead to code generation
+  // that conflicts with the Kernel declaration that we emit in the Native CPU
+  // helper header (in which all the kernel argument are void* or scalars).
+  auto AttList = F->getAttributes();
+  for (unsigned ArgNo = 0; ArgNo < F->getFunctionType()->getNumParams();
+       ArgNo++) {
+    if (AttList.hasParamAttr(ArgNo, Attribute::AttrKind::ByVal)) {
+      AttList = AttList.removeParamAttribute(F->getContext(), ArgNo,
+                                             Attribute::AttrKind::ByVal);
+    }
+  }
+  F->setAttributes(AttList);
+  F->addFnAttr("frame-pointer", "none");
+}
+
 // Helper macros for constructing builtin MS names
 #define GENMS1(builtin_str) "?" builtin_str "@@YA_KXZ"
 
@@ -141,8 +157,10 @@ ConvertToMuxBuiltinsSYCLNativeCPUPass::run(Module &M,
                                            ModuleAnalysisManager &MAM) {
   bool ModuleChanged = false;
   for (auto &F : M) {
-    if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL)
+    if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
       setIsKernelEntryPt(F);
+      fixCallingConv(&F);
+    }
   }
   const bool VisualStudioMangling = isForVisualStudio(M.getTargetTriple());
 
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 0197814bd1f2e..71fa3e85bf329 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -15,19 +15,43 @@
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
+#include "llvm/Passes/PassBuilder.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
+#include "compiler/utils/device_info.h"
 #include "compiler/utils/sub_group_analysis.h"
 #include "compiler/utils/work_item_loops_pass.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_target_info.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #endif
 
 namespace llvm {
+cl::opt<bool> NativeCPUVecz("ncpu-vecz", cl::init(false), cl::desc("Run vectorizer on SYCL Native CPU"));
+cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(1), cl::desc("Vector width for SYCL Native CPU vectorizer"));
 void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
+  if(NativeCPUVecz) {
+    MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
+    MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
+    auto queryFunc =
+        [](llvm::Function &F, llvm::ModuleAnalysisManager &,
+           llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
+      if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+        return false;
+      }
+      compiler::utils::VectorizationFactor VF(NativeCPUVeczWidth, false);
+      vecz::VeczPassOptions VPO;
+      VPO.factor = VF;
+      Opts.emplace_back(VPO);
+      return true;
+    };
+    MAM.registerPass([&] { return vecz::VeczPassOptionsAnalysis(queryFunc); });
+    MPM.addPass(vecz::RunVeczPass());
+  }
   // Todo set options properly
   compiler::utils::WorkItemLoopsPassOptions Opts;
   Opts.IsDebug = false;
@@ -36,9 +60,15 @@ void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
   MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
   MPM.addPass(AlwaysInlinerPass());
-
 #endif
   MPM.addPass(PrepareSYCLNativeCPUPass());
   MPM.addPass(RenameKernelSYCLNativeCPUPass());
+
+  // Run optimization passes after all the changes we made to the kernels.
+  // Todo: check optimization level from clang
+  // Todo: maybe we could find a set of relevant passes instead of re-running the full 
+  // optimization pipeline.
+  PassBuilder PB;
+  MPM.addPass(PB.buildPerModuleDefaultPipeline(OptimizationLevel()));
 }
 } // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 1701412594b40..0ace2f81d94a0 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -49,6 +49,7 @@
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/attributes.h"
 #include "compiler/utils/builtin_info.h"
+#include "compiler/utils/metadata.h"
 #endif
 
 using namespace llvm;
@@ -57,19 +58,6 @@ namespace {
 
 void fixCallingConv(Function *F) {
   F->setCallingConv(llvm::CallingConv::C);
-  // The frame-pointer=all and the "byval" attributes lead to code generation
-  // that conflicts with the Kernel declaration that we emit in the Native CPU
-  // helper header (in which all the kernel argument are void* or scalars).
-  auto AttList = F->getAttributes();
-  for (unsigned ArgNo = 0; ArgNo < F->getFunctionType()->getNumParams();
-       ArgNo++) {
-    if (AttList.hasParamAttr(ArgNo, Attribute::AttrKind::ByVal)) {
-      AttList = AttList.removeParamAttribute(F->getContext(), ArgNo,
-                                             Attribute::AttrKind::ByVal);
-    }
-  }
-  F->setAttributes(AttList);
-  F->addFnAttr("frame-pointer", "none");
 }
 
 void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType,
@@ -196,6 +184,8 @@ static Function *getReplaceFunc(const Module &M, StringRef Name) {
 }
 
 static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
+  // Todo: we should probably cache the state thread local load here 
+  // to avoid re-emitting it for each builtin
   if (StateTLS) {
     IRBuilder<> BB(&*F->getEntryBlock().getFirstInsertionPt());
     llvm::Value *V = BB.CreateThreadLocalAddress(StateTLS);
@@ -232,6 +222,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 
   CurrentStatePointerTLS = nullptr;
 
+  // check if any of the kernels is called by some other function.
+  // This can happen e.g. with OCK, where wrapper functions are 
+  // created around the origianl kernel.
+  bool KernelIsCalled = false;
+  for(auto& K : OldKernels) {
+    for(auto& U : K->uses()){
+      if(isa<CallBase>(U.getUser())) {
+        KernelIsCalled = true;
+      }
+    }
+  }
+
   // Then we iterate over all the supported builtins, find the used ones
   llvm::SmallVector<std::pair<llvm::Function *, StringRef>> UsedBuiltins;
   for (const auto &Entry : BuiltinNamesMap) {
@@ -242,9 +244,9 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
       auto I = dyn_cast<CallInst>(Use.getUser());
       if (!I)
         report_fatal_error("Unsupported Value in SYCL Native CPU\n");
-      if (!IsNativeCPUKernel(I->getFunction())) {
+      if (!IsNativeCPUKernel(I->getFunction()) || KernelIsCalled) {
         // only use the threadlocal if we have kernels calling builtins
-        // indirectly
+        // indirectly, or if the kernel is called by some other func.
         if (CurrentStatePointerTLS == nullptr)
           CurrentStatePointerTLS = M.getOrInsertGlobal(
               STATE_TLS_NAME, StatePtrType, [&M, StatePtrType]() {
@@ -271,6 +273,15 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 #ifdef NATIVECPU_USE_OCK
     auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
     OldF->setName(Name);
+    // if vectorization occurred, at this point we have a wrapper function that 
+    // runs the vectorized kernel and peels using the scalar kernel. We make it so
+    // this wrapper steals the original kernel name.
+    std::optional<compiler::utils::LinkMetadataResult> veczR = compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
+    if(veczR) {
+      auto ScalarF = veczR.value().first;
+      OldF->takeName(ScalarF);
+      ScalarF->setName(OldF->getName() + "_scalar");
+    }
 #endif
     auto *NewF =
         cloneFunctionAndAddParam(OldF, StatePtrType, CurrentStatePointerTLS);
diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 39b5a6b93d957..04fbb3bec51cd 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -55,8 +55,8 @@ if(Native_CPU_USE_OCK)
 
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
-   GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
-   GIT_TAG        sycl_native_experimental
+   GIT_REPOSITORY https://github.com/PietroGhg/oneapi-construction-kit.git
+   GIT_TAG        pietro/vecz
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)
@@ -70,8 +70,9 @@ if(Native_CPU_USE_OCK)
   target_include_directories(LLVMSYCLLowerIR PRIVATE 
     ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
     ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
+    ${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
     ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
-  target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-utils)
+  target_link_libraries(LLVMSYCLLowerIR PRIVATE compiler-utils vecz)
   target_compile_definitions(pi_native_cpu PRIVATE NATIVECPU_USE_OCK)
 
 endif()
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
index e3a3cac2ea011..03e7107f54a38 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -52,6 +52,16 @@ static void runWorkGroupLoops(const sycl::detail::NDRDescT &ndr,
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
   auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
+  const bool localSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  if(localSizeOne) {
+    // put everything in one work group, this is just an experiment
+    state.MWorkGroup_size[0] = numWG0;
+    state.MWorkGroup_size[1] = numWG1;
+    state.MWorkGroup_size[2] = numWG2;
+    numWG0 = 1;
+    numWG1 = 1;
+    numWG2 = 1;
+  }
   for (unsigned g2 = 0; g2 < numWG2; g2++) {
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {

From c5697e7877b53cb4381b359ec662151dc0875c06 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Tue, 31 Oct 2023 14:54:29 +0000
Subject: [PATCH 11/43] Better defaults

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp      |  2 +-
 .../ur/adapters/native_cpu/device.cpp               | 13 +++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 71fa3e85bf329..c01c5537c54f8 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -29,7 +29,7 @@
 
 namespace llvm {
 cl::opt<bool> NativeCPUVecz("ncpu-vecz", cl::init(false), cl::desc("Run vectorizer on SYCL Native CPU"));
-cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(1), cl::desc("Vector width for SYCL Native CPU vectorizer"));
+cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(8), cl::desc("Vector width for SYCL Native CPU vectorizer"));
 void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp
index ad51194c212c4..9a6c57e3d1c06 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/device.cpp
@@ -98,7 +98,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
     return ReturnValue(bool{false});
   case UR_DEVICE_INFO_MAX_COMPUTE_UNITS:
-    return ReturnValue(uint32_t{256});
+    // todo: return number of threads in theadpool
+    return ReturnValue(uint32_t{8});
   case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES:
     return ReturnValue(uint32_t{0});
   case UR_DEVICE_INFO_SUPPORTED_PARTITIONS:
@@ -158,14 +159,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT:
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE:
   case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF:
+  // todo: how can we query vector width in a platform
+  // indipendent way?
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR:
+    return ReturnValue(uint32_t{32});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT:
+    return ReturnValue(uint32_t{16});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT:
+    return ReturnValue(uint32_t{8});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG:
+    return ReturnValue(uint32_t{4});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT:
+    return ReturnValue(uint32_t{8});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE:
+    return ReturnValue(uint32_t{4});
   case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF:
-    return ReturnValue(uint32_t{1});
+    return ReturnValue(uint32_t{16});
 
   // Imported from level_zero
   case UR_DEVICE_INFO_USM_HOST_SUPPORT:

From 986e37ab96e208e6f6da1f145f4335798cf44add Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Wed, 1 Nov 2023 13:41:55 +0000
Subject: [PATCH 12/43] Consistent naming for cmake var

---
 sycl/plugins/native_cpu/CMakeLists.txt      |  9 +++------
 sycl/plugins/unified_runtime/CMakeLists.txt | 10 ++++------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 39b5a6b93d957..8015c3bb76452 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -37,11 +37,8 @@ add_sycl_plugin(native_cpu
     UnifiedRuntime-Headers
 )
 
-if(NOT DEFINED NATIVECPU_USE_OCK)
-  option(Native_CPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" On)
-endif()
 
-if(Native_CPU_USE_OCK)
+if(NATIVECPU_USE_OCK)
   # TODO: I'm not sure why we need this here, but we get errors when configuring without it 
   if(NOT CMAKE_CLC_COMPILE_OBJECT)
     set(CMAKE_CLC_COMPILE_OBJECT
@@ -60,9 +57,9 @@ if(Native_CPU_USE_OCK)
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)
-    message("Cloning oneAPI Construction Kit")
+    message(STATUS "Cloning oneAPI Construction Kit")
     FetchContent_Populate(oneapi-ck)
-    message("oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
+    message(STATUS "oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
     set(CA_NATIVE_CPU 1)
     add_subdirectory(${oneapi-ck_SOURCE_DIR} ${oneapi-ck_BINARY_DIR})
   endif()
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 7aaba45b36fbb..baf30ada2c7fb 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -198,15 +198,13 @@ if("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS)
       OpenCL-Headers
   )
 
-  if(NOT DEFINED NATIVECPU_USE_OCK)
-    option(Native_CPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" On)
-  endif()
+  option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" On)
 
-  if(Native_CPU_USE_OCK)
-    message("Compiling Native CPU adapter with OCK support.")
+  if(NATIVECPU_USE_OCK)
+    message(STATUS "Compiling Native CPU adapter with OCK support.")
     target_compile_definitions(ur_adapter_native_cpu PRIVATE NATIVECPU_USE_OCK)
   else()
-    message("Compiling Native CPU adapter without OCK support.
+    message(WARNING "Compiling Native CPU adapter without OCK support.
     Some valid SYCL programs may not build or may have low performance.")
   endif()
 

From f07433fb16a1d57b6d4f3234b8063abd0858fa26 Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Tue, 7 Nov 2023 15:09:46 +0000
Subject: [PATCH 13/43] Enable vectorization by default

---
 clang/include/clang/Basic/LangOptions.def     |  1 +
 clang/include/clang/Driver/Options.td         |  4 +++
 clang/lib/CodeGen/BackendUtil.cpp             |  2 +-
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h     |  2 +-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 28 ++++++++++++++----
 .../ur/adapters/native_cpu/enqueue.cpp        | 29 ++++++++++++-------
 .../native_cpu/native_cpu_builtins.cpp        | 12 ++++----
 .../native_cpu/vectorization.cpp              | 21 ++++++++++++++
 sycl/test/native_cpu/link-noinline.cpp        | 10 +++++--
 9 files changed, 84 insertions(+), 25 deletions(-)
 create mode 100644 sycl/test/check_device_code/native_cpu/vectorization.cpp

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index e373ba07008d5..419e975608399 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -298,6 +298,7 @@ LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
 LANGOPT(SYCLAllowVirtualFunctions, 1, 0,
         "Allow virtual functions calls in code for SYCL device")
 LANGOPT(SYCLIsNativeCPU      , 1, 0, "Generate code for SYCL NativeCPU")
+LANGOPT(SYCLNativeCPUNoVecz  , 1, 0, "Disable vectorization on SYCL Native CPU")
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
 LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index eeb9496f684d7..a751c2532d057 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6478,6 +6478,10 @@ def fsycl_fp32_prec_sqrt : Flag<["-"], "fsycl-fp32-prec-sqrt">, Group<sycl_Group
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"SYCL only. Specify that single precision floating-point sqrt is correctly rounded.">,
   MarshallingInfoFlag<CodeGenOpts<"SYCLFp32PrecSqrt">>;
+def fsycl_native_cpu_no_vecz : Flag<["-"], "fsycl-native-cpu-no-vecz">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"Disable vectorization on SYCL Native CPU">,
+  MarshallingInfoFlag<LangOpts<"SYCLNativeCPUNoVecz">>;
 
 //===----------------------------------------------------------------------===//
 // FLangOption + NoXarchOption
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 418dc9c9bb3e8..7fce4e46bd43f 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1094,7 +1094,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      addSYCLNativeCPUBackendPasses(MPM, MAM);
+      addSYCLNativeCPUBackendPasses(MPM, MAM, Level.getSpeedupLevel(), LangOpts.SYCLNativeCPUNoVecz);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index 93cdd0ae014c3..8379a88d258a6 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -13,7 +13,7 @@
 
 namespace llvm {
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM);
+                                   ModuleAnalysisManager &MAM, unsigned OptLevel, bool DisableVecz);
 namespace sycl {
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 985a227d15d7e..1765eb22c9365 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -28,13 +28,14 @@
 #endif
 
 namespace llvm {
-cl::opt<bool> NativeCPUVecz("ncpu-vecz", cl::init(false), cl::desc("Run vectorizer on SYCL Native CPU"));
-cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(8), cl::desc("Vector width for SYCL Native CPU vectorizer"));
+cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(8), cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));
 void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM) {
+                                   ModuleAnalysisManager &MAM, unsigned OptLevel, bool DisableVecz) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
-  if(NativeCPUVecz) {
+  // Always enable vectorizer, unless explictly disabled or -O0 is set.
+  llvm::errs() << "[ptrdbg] optl: " << OptLevel << " dis " << DisableVecz << "\n";
+  if(OptLevel != 0 && !DisableVecz) {
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
     auto queryFunc =
@@ -69,6 +70,23 @@ void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
   // Todo: maybe we could find a set of relevant passes instead of re-running the full 
   // optimization pipeline.
   PassBuilder PB;
-  MPM.addPass(PB.buildPerModuleDefaultPipeline(OptimizationLevel()));
+  OptimizationLevel Level;
+  switch(OptLevel) {
+  case 0:
+    Level = OptimizationLevel::O0;
+    break;
+  case 1:
+    Level = OptimizationLevel::O1;
+    break;
+  case 2:
+    Level = OptimizationLevel::O2;
+    break;
+  case 3:
+    Level = OptimizationLevel::O3;
+    break;
+  default:
+    llvm_unreachable("Unsupported opt level");
+  }
+  MPM.addPass(PB.buildPerModuleDefaultPipeline(Level));
 }
 } // namespace llvm
diff --git a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
index 55cb0a73d9fe3..79cb444386abc 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
@@ -35,6 +35,12 @@ struct NDRDescT {
       GlobalOffset[I] = 0;
     }
   }
+
+  void dump(std::ostream& os) const {
+    os << "GlobalSize: " << GlobalSize[0] << " " << GlobalSize[1] << " "<< GlobalSize[2] << "\n";
+    os << "LocalSize: " << LocalSize[0] << " " << LocalSize[1] << " "<< LocalSize[2] << "\n";
+    os << "GlobalOffset: " << GlobalOffset[0] << " " << GlobalOffset[1] << " "<< GlobalOffset[2] << "\n";
+  }
 };
 } // namespace native_cpu
 
@@ -49,16 +55,19 @@ static void runWorkGroupLoops(const native_cpu::NDRDescT &ndr,
   auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
   auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
   auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
-  const bool localSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
-  if(localSizeOne) {
-    // put everything in one work group, this is just an experiment
-    state.MWorkGroup_size[0] = numWG0;
-    state.MWorkGroup_size[1] = numWG1;
-    state.MWorkGroup_size[2] = numWG2;
-    numWG0 = 1;
-    numWG1 = 1;
-    numWG2 = 1;
-  }
+  //const bool localSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1;
+  //const bool onlyDim0 = ndr.GlobalSize[1] == 1 && ndr.GlobalSize[2] == 1;
+  //const bool noOffset = ndr.GlobalOffset[0] == 0 && ndr.GlobalOffset[1] == 0 && ndr.GlobalOffset[2] == 0;
+  //if(localSizeOne && onlyDim0 && noOffset) {
+  //  // put everything in one work group, this is just an experiment
+  //  // Todo: this is unsafe to do if the kernel calls get_global_range/get_local_range.
+  //  state.MWorkGroup_size[0] = numWG0;
+  //  state.MWorkGroup_size[1] = numWG1;
+  //  state.MWorkGroup_size[2] = numWG2;
+  //  numWG0 = 1;
+  //  numWG1 = 1;
+  //  numWG2 = 1;
+  //}
   for (unsigned g2 = 0; g2 < numWG2; g2++) {
     for (unsigned g1 = 0; g1 < numWG1; g1++) {
       for (unsigned g0 = 0; g0 < numWG0; g0++) {
diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
index b7968f66ac94b..741e8b6afa7e4 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
@@ -1,17 +1,17 @@
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o %t_temp.ll
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o %t_temp.ll
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // Check that builtins are emitted as expected
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-BT
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-BT
 
 // check that we added the state struct as a function argument, and that we
 // inject the calls to our builtins.
@@ -27,7 +27,7 @@ int main() {
   sycl::range<1> r(1);
   deviceQueue.submit([&](sycl::handler &h) {
     h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
-    // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
+    // CHECK: @_ZTS5Test1.NativeCPUKernel(ptr {{.*}}, ptr {{.*}}, ptr addrspace(1){{.*}})
     // CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
     // CHECK-NOT: @llvm.threadlocal
 
diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
new file mode 100644
index 0000000000000..3630aef92305e
--- /dev/null
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -0,0 +1,21 @@
+// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
+// RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -fsycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
+#include <sycl/sycl.hpp>
+class Test1;
+int main() {
+  sycl::queue deviceQueue;
+  sycl::accessor<int, 1, sycl::access::mode::write> acc;
+  sycl::range<1> r(1);
+  deviceQueue.submit([&](sycl::handler &h) {
+    h.parallel_for<Test1>(r, [=](sycl::id<1> id) { acc[id[0]] = 42; });
+    // CHECK-DEFAULT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+    // CHECK-16: store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+    // CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
+    // CHECK-O0: store i32 42
+    // CHECK-DISABLE: store i32 42
+  });
+}
diff --git a/sycl/test/native_cpu/link-noinline.cpp b/sycl/test/native_cpu/link-noinline.cpp
index fb4ddd5e0b0ce..ca26b808b776a 100644
--- a/sycl/test/native_cpu/link-noinline.cpp
+++ b/sycl/test/native_cpu/link-noinline.cpp
@@ -56,9 +56,10 @@ int test_all(int expect_val) {
 
 int main() {
   const size_t N = 4;
-  std::array<int, N> C{{0, 0, 0, 0}};
+  std::array<int, N> C{{-6, -6, -6, -6}};
   sycl::queue deviceQueue;
   sycl::range<1> numOfItems{N};
+  {
   sycl::buffer<int, 1> bufferC(C.data(), numOfItems);
 
   if (test_all(HOST_RET) != HOST_RET)
@@ -74,14 +75,19 @@ int main() {
         cgh.parallel_for<class SimpleVadd>(numOfItems, kern);
       })
       .wait();
+  }
 
+  bool pass = true;
   for (unsigned int i = 0; i < N; i++) {
     if (C[i] != DEVICE_RET) {
       std::cout << "The results are incorrect (element " << i << " is " << C[i]
                 << "!\n";
-      return 2;
+      pass = false;
     }
   }
+  if(pass) {
   std::cout << "The results are correct!\n";
   return 0;
+  }
+  return 2;
 }

From 9f4a6b9b0c578d67fe7b98e9a7b1c5b49214130f Mon Sep 17 00:00:00 2001
From: "pietro.ghiglio" <pietro.ghiglio@codeplay.com>
Date: Tue, 7 Nov 2023 16:46:55 +0000
Subject: [PATCH 14/43] formatting

---
 clang/lib/CodeGen/BackendUtil.cpp              |  2 +-
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h      | 15 ++++++++-------
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp |  2 +-
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp  | 18 ++++++++++--------
 4 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 418dc9c9bb3e8..415c009883972 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -49,10 +49,10 @@
 #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
 #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h"
 #include "llvm/SYCLLowerIR/MutatePrintfAddrspace.h"
-#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h"
 #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h"
+#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/MemoryBuffer.h"
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index 93cdd0ae014c3..f2b8ee6c3c514 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -20,15 +20,16 @@ const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";
 const constexpr char NativeCPUGlobalOffset[] =
     "__dpcpp_nativecpu_get_global_offset";
-const constexpr char NativeCPULocalId[] =
-    "__dpcpp_nativecpu_get_local_id";
-const constexpr char NativeCPUNumGroups[] =
-    "__dpcpp_nativecpu_get_num_groups";
+const constexpr char NativeCPULocalId[] = "__dpcpp_nativecpu_get_local_id";
+const constexpr char NativeCPUNumGroups[] = "__dpcpp_nativecpu_get_num_groups";
 const constexpr char NativeCPUWGSize[] = "__dpcpp_nativecpu_get_wg_size";
 const constexpr char NativeCPUWGId[] = "__dpcpp_nativecpu_get_wg_id";
-const constexpr char NativeCPUSetNumSubgroups[] = "__dpcpp_nativecpu_set_num_sub_groups";
-const constexpr char NativeCPUSetSubgroupId[] = "__dpcpp_nativecpu_set_sub_group_id";
-const constexpr char NativeCPUSetMaxSubgroupSize[] = "__dpcpp_nativecpu_set_max_sub_group_size";
+const constexpr char NativeCPUSetNumSubgroups[] =
+    "__dpcpp_nativecpu_set_num_sub_groups";
+const constexpr char NativeCPUSetSubgroupId[] =
+    "__dpcpp_nativecpu_set_sub_group_id";
+const constexpr char NativeCPUSetMaxSubgroupSize[] =
+    "__dpcpp_nativecpu_set_max_sub_group_size";
 const constexpr char NativeCPUSetLocalId[] = "__dpcpp_nativecpu_set_local_id";
 } // namespace sycl
 } // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 0947fce6af9b5..89bcaea88042b 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -11,10 +11,10 @@
 // When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
+#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 03508940c428d..de490c9ab09ea 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -236,7 +236,8 @@ static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) {
   Type *DimTy = I32Ty;
   Type *ValTy = I64Ty;
   Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS);
-  static FunctionType *FTy = FunctionType::get(RetTy, {DimTy, ValTy, PtrTy}, false);
+  static FunctionType *FTy =
+      FunctionType::get(RetTy, {DimTy, ValTy, PtrTy}, false);
   auto FCallee = M.getOrInsertFunction(Name, FTy);
   auto *F = dyn_cast<Function>(FCallee.getCallee());
   IRBuilder<> Builder(Ctx);
@@ -246,15 +247,15 @@ static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) {
   auto *IdxProm = Builder.CreateZExt(F->getArg(0), DimTy, "idxprom");
   auto *Zero = ConstantInt::get(I64Ty, 0);
   auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(NativeCPULocalId));
-  auto *GEP =
-      Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm});
+  auto *GEP = Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm});
   // store local id
   auto *Val = F->getArg(1);
   Builder.CreateStore(Val, GEP);
   // update global id
   auto loadHelper = [&](const char *BTName) {
     auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(BTName));
-    auto *Addr = Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm});
+    auto *Addr =
+        Builder.CreateGEP(StateType, StatePtr, {Zero, Offset, IdxProm});
     auto *Load = Builder.CreateLoad(I64Ty, Addr);
     return Load;
   };
@@ -264,7 +265,8 @@ static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) {
   auto *Mul = Builder.CreateMul(WGId, WGSize);
   auto *GId = Builder.CreateAdd(Builder.CreateAdd(Mul, GlobalOffset), Val);
   auto *GIdOffset = ConstantInt::get(I32Ty, OffsetMap.at(NativeCPUGlobalId));
-  auto *GIdAddr = Builder.CreateGEP(StateType, StatePtr, {Zero, GIdOffset, IdxProm});
+  auto *GIdAddr =
+      Builder.CreateGEP(StateType, StatePtr, {Zero, GIdOffset, IdxProm});
   Builder.CreateStore(GId, GIdAddr);
   Builder.CreateRetVoid();
   return F;
@@ -293,14 +295,14 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) {
   return F;
 }
 
-static Function* addReplaceFunc(Module& M, StringRef Name, Type *StateType) {
+static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
   Function *Res;
-  if(Name.startswith("__dpcpp_nativecpu_get")) {
+  if (Name.startswith("__dpcpp_nativecpu_get")) {
     Res = addGetFunc(M, Name, StateType);
   } else if (Name == NativeCPUSetLocalId) {
     Res = addSetLocalIdFunc(M, Name, StateType);
   } else {
-    // the other __dpcpp_nativecpu_set* builtins are subgroup-related and 
+    // the other __dpcpp_nativecpu_set* builtins are subgroup-related and
     // not supported yet, emit empty functions for now.
     auto &Ctx = M.getContext();
     Type *I32Ty = Type::getInt32Ty(Ctx);

From f2b634e403e70912807d8648ee8d069b5fc43b30 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Wed, 8 Nov 2023 12:43:24 +0000
Subject: [PATCH 15/43] Remove debug print

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 1765eb22c9365..6d34af1f1cbb3 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -34,7 +34,6 @@ void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
-  llvm::errs() << "[ptrdbg] optl: " << OptLevel << " dis " << DisableVecz << "\n";
   if(OptLevel != 0 && !DisableVecz) {
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });

From 6529479a9235e5e6ecd96af68942b9c633bbd3a0 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 9 Nov 2023 14:18:10 +0000
Subject: [PATCH 16/43] Test updated OCK branch

---
 sycl/plugins/native_cpu/CMakeLists.txt        | 4 ++--
 sycl/test/native_cpu/sycl-external-static.cpp | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 8015c3bb76452..ce6d82fd02922 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -52,8 +52,8 @@ if(NATIVECPU_USE_OCK)
 
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
-   GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
-   GIT_TAG        sycl_native_experimental
+   GIT_REPOSITORY https://github.com/PietroGhg/oneapi-construction-kit.git
+   GIT_TAG       pietro/nativecpu_update 
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)
diff --git a/sycl/test/native_cpu/sycl-external-static.cpp b/sycl/test/native_cpu/sycl-external-static.cpp
index a8409cc1f3ac2..e957144823e43 100644
--- a/sycl/test/native_cpu/sycl-external-static.cpp
+++ b/sycl/test/native_cpu/sycl-external-static.cpp
@@ -7,9 +7,6 @@
 // RUN: llvm-ar crv %t.a %t1.o
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %t2.o %t.a -o %t
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t
-// This currently fails because we have a static array of functions which
-// we do not support when handling barriers.
-// XFAIL: *
 
 #include <iostream>
 #include <sycl/sycl.hpp>

From 9023de63fbdbda597cad86127eb69ed6e2d37d41 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 9 Nov 2023 16:26:56 +0000
Subject: [PATCH 17/43] Restore real ock tag

---
 sycl/plugins/native_cpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index ce6d82fd02922..8015c3bb76452 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -52,8 +52,8 @@ if(NATIVECPU_USE_OCK)
 
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
-   GIT_REPOSITORY https://github.com/PietroGhg/oneapi-construction-kit.git
-   GIT_TAG       pietro/nativecpu_update 
+   GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
+   GIT_TAG        sycl_native_experimental
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)

From 8fc2392ffd62c8dff98e7e57037d47d7d29913cb Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 16 Nov 2023 08:41:01 +0000
Subject: [PATCH 18/43] Move utily functions to UtilsSYCLNativeCPU.h

---
 clang/lib/CodeGen/BackendUtil.cpp             |  2 +-
 .../ClangOffloadWrapper.cpp                   |  1 +
 llvm/include/llvm/SYCLLowerIR/SYCLUtils.h     |  8 --------
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h     | 19 +++++++++++++++++--
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp |  7 ++++---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp |  4 +++-
 .../SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp |  1 +
 7 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 969e964c7528b..9f71c63d30eec 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1098,7 +1098,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      addSYCLNativeCPUBackendPasses(MPM, MAM);
+      sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
index 5c8328865601b..31ca1662dec01 100644
--- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
+++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
+#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Errc.h"
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
index 0d4ffd36a6313..c9ebcdae53f4b 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h
@@ -117,14 +117,6 @@ inline bool isSYCLExternalFunction(const Function *F) {
   return F->hasFnAttribute(ATTR_SYCL_MODULE_ID);
 }
 
-constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU";
-constexpr char SYCLNATIVECPUKERNEL[] = ".NativeCPUKernel";
-inline llvm::Twine addSYCLNativeCPUSuffix(StringRef S) {
-  if (S.startswith("__dpcpp_nativecpu") || S.endswith(SYCLNATIVECPUKERNEL))
-    return S;
-  return llvm::Twine(S, SYCLNATIVECPUSUFFIX);
-}
-
 } // namespace utils
 } // namespace sycl
 } // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index f2b8ee6c3c514..fee9be47c6c6b 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -9,12 +9,16 @@
 // Utility functions and constants for SYCL Native CPU.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetMachine.h"
+#pragma once
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/Twine.h"
 
 namespace llvm {
+namespace sycl {
+namespace utils {
+
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM);
-namespace sycl {
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";
@@ -31,5 +35,16 @@ const constexpr char NativeCPUSetSubgroupId[] =
 const constexpr char NativeCPUSetMaxSubgroupSize[] =
     "__dpcpp_nativecpu_set_max_sub_group_size";
 const constexpr char NativeCPUSetLocalId[] = "__dpcpp_nativecpu_set_local_id";
+
+constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU";
+constexpr char SYCLNATIVECPUKERNEL[] = ".NativeCPUKernel";
+constexpr char SYCLNATIVECPUPREFIX[] = "__dpcpp_nativecpu";
+inline llvm::Twine addSYCLNativeCPUSuffix(StringRef S) {
+  if (S.startswith(SYCLNATIVECPUPREFIX) || S.endswith(SYCLNATIVECPUKERNEL))
+    return S;
+  return llvm::Twine(S, SYCLNATIVECPUSUFFIX);
+}
+
+} // namespace utils
 } // namespace sycl
 } // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 89bcaea88042b..386eee5f1bcf3 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -23,8 +23,10 @@
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #endif
 
-namespace llvm {
-void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
+using namespace llvm;
+using namespace sycl::utils;
+
+void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
@@ -41,4 +43,3 @@ void addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
   MPM.addPass(PrepareSYCLNativeCPUPass());
   MPM.addPass(RenameKernelSYCLNativeCPUPass());
 }
-} // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index bbc2516ffac3d..fd65f633aacb9 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -56,6 +56,7 @@
 
 using namespace llvm;
 using namespace sycl;
+using namespace sycl::utils;
 
 namespace {
 
@@ -284,7 +285,8 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) {
 
 static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
   Function *Res;
-  if (Name.startswith("__dpcpp_nativecpu_get")) {
+  const char GetPrefix[] = "__dpcpp_nativecpu_get";
+  if (Name.startswith(GetPrefix)) {
     Res = addGetFunc(M, Name, StateType);
   } else if (Name == NativeCPUSetLocalId) {
     Res = addSetLocalIdFunc(M, Name, StateType);
diff --git a/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp
index f9a06457b4eea..968a75ea476b2 100644
--- a/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
+#include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 #include <set>
 
 using namespace llvm;

From 11de7b31a07de613e60e8887bb4fd8b0c50711ec Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 16 Nov 2023 08:47:23 +0000
Subject: [PATCH 19/43] Consistent naming in docs

---
 sycl/doc/design/SYCLNativeCPU.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 7c11204c296f7..6c4ddb27f24a9 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -38,7 +38,7 @@ python buildbot/configure.py \
 # other options here
 ```
 
-SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit) (OCK) in order to support some core SYCL functionalities and improve performances, the OCK is fetched by default when Native CPU is enabled, and can optionally be disabled using the `NATIVECPU_USE_OCK` CMake variable (please note that disabling the OCK will result in limited functionalities and performances on the Native CPU backend):
+SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit) (OCK) in order to support some core SYCL functionalities and improve performances, the OCK is fetched by default when SYCL Native CPU is enabled, and can optionally be disabled using the `NATIVECPU_USE_OCK` CMake variable (please note that disabling the OCK will result in limited functionalities and performances on the SYCL Native CPU backend):
 
 ```
 python3 buildbot/configure.py \
@@ -46,14 +46,14 @@ python3 buildbot/configure.py \
   --cmake-opt=-DNATIVE_CPU_USE_OCK=Off
 ```
 
-The Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. 
+The SYCL Native CPU device needs to be selected at runtime by setting the environment variable `ONEAPI_DEVICE_SELECTOR=native_cpu:cpu`. 
 
 # Supported features and current limitations
 
 The SYCL Native CPU flow is still WIP, not optimized and several core SYCL features are currently unsupported. Currently `barriers` are supported only when the oneAPI Construction Kit integration is enabled, several math builtins are not supported and attempting to use those will most likely fail with an `undefined reference` error at link time. Examples of supported applications can be found in the [runtime tests](https://github.com/intel/llvm/blob/sycl/sycl/test/native_cpu).
 
 
-To execute the `e2e` tests on the Native CPU, configure the test suite with:
+To execute the `e2e` tests on SYCL Native CPU, configure the test suite with:
 
 ```bash
 # make sure that DPC++ is in your $PATH and your environment is configured for DPC++
@@ -81,7 +81,7 @@ Note that a number of `e2e` tests are currently still failing.
 
 # Technical details
 
-The following section gives a brief overview of how a simple SYCL application is compiled for the Native CPU target. Consider the following SYCL sample, which performs vector addition using USM:
+The following section gives a brief overview of how a simple SYCL application is compiled for the SYCL Native CPU target. Consider the following SYCL sample, which performs vector addition using USM:
 
 ```c++
   cl::sycl::queue deviceQueue;
@@ -122,7 +122,7 @@ entry:
 }
 ```
 
-For the Native CPU target, the device compiler is in charge of materializing the SPIRV builtins (such as `@__spirv_BuiltInGlobalInvocationId`), so that they can be correctly updated by the runtime when executing the kernel. This is performed by the [PrepareSYCLNativeCPU pass](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp).
+For the SYCL Native CPU target, the device compiler is in charge of materializing the SPIRV builtins (such as `@__spirv_BuiltInGlobalInvocationId`), so that they can be correctly updated by the runtime when executing the kernel. This is performed by the [PrepareSYCLNativeCPU pass](https://github.com/intel/llvm/blob/sycl/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp).
 The PrepareSYCLNativeCPUPass also emits a `subhandler` function, which receives the kernel arguments from the SYCL runtime (packed in a vector), unpacks them, and forwards only the used ones to the actual kernel. 
 
 
@@ -152,7 +152,7 @@ entry:
 ```
 This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime.
 
-The `subhandler` for the Native CPU kernel looks like: 
+The `subhandler` for the SYCL Native CPU kernel looks like: 
 
 ```llvm
 define weak void @_Z6Sample(ptr %0, ptr %1) #4 {
@@ -177,7 +177,7 @@ On SYCL Native CPU, calls to `__spirv_ControlBarrier` are handled using the `Wor
 
 ## Kernel registration
 
-In order to register the Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
+In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
 
 ```
  -------------------------------------------------------
@@ -193,5 +193,5 @@ Each entry in the array contains the kernel name as a string, and a pointer to t
 
 ## Kernel lowering and execution
 
-The information produced by the device compiler is then employed to correctly lower the kernel LLVM-IR module to the target ISA (this is performed by the driver when `-fsycl-targets=native_cpu` is set). The object file containing the kernel code is linked with the host object file (and libsycl and any other needed library) and the final executable is ran using the Native CPU PI Plug-in, defined in [pi_native_cpu.cpp](https://github.com/intel/llvm/blob/sycl/sycl/plugins/native_cpu/pi_native_cpu.cpp).
+The information produced by the device compiler is then employed to correctly lower the kernel LLVM-IR module to the target ISA (this is performed by the driver when `-fsycl-targets=native_cpu` is set). The object file containing the kernel code is linked with the host object file (and libsycl and any other needed library) and the final executable is ran using the SYCL Native CPU PI Plug-in, defined in [pi_native_cpu.cpp](https://github.com/intel/llvm/blob/sycl/sycl/plugins/native_cpu/pi_native_cpu.cpp).
 

From 376556d0bd4e4d54d098a3c93774718d40cf3027 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 15:58:02 +0000
Subject: [PATCH 20/43] change fixCallingConv name

---
 llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
index 605ae084a96fd..529c732d5226b 100644
--- a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
@@ -25,7 +25,7 @@ using namespace llvm;
 
 namespace {
 
-static void fixCallingConv(Function *F) {
+static void fixFunctionAttributes(Function *F) {
   // The frame-pointer=all and the "byval" attributes lead to code generation
   // that conflicts with the Kernel declaration that we emit in the Native CPU
   // helper header (in which all the kernel argument are void* or scalars).
@@ -158,7 +158,7 @@ ConvertToMuxBuiltinsSYCLNativeCPUPass::run(Module &M,
   bool ModuleChanged = false;
   for (auto &F : M) {
     if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
-      fixCallingConv(&F);
+      fixFunctionAttributes(&F);
       setIsKernelEntryPt(F);
     }
   }

From 93690b647e921a0cf2d4f6c1c78196800992e731 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 16:00:23 +0000
Subject: [PATCH 21/43] Check after dyn_cast

---
 .../SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
index 529c732d5226b..2eccac0001507 100644
--- a/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <map>
 
@@ -94,8 +95,12 @@ Function *getMuxBarrierFunc(Module &M) {
   auto *Int32Ty = Type::getInt32Ty(Ctx);
   static auto *MuxFTy = FunctionType::get(Type::getVoidTy(Ctx),
                                           {Int32Ty, Int32Ty, Int32Ty}, false);
-  auto F = M.getOrInsertFunction(MuxBarrier, MuxFTy);
-  return cast<Function>(F.getCallee());
+  auto FCallee = M.getOrInsertFunction(MuxBarrier, MuxFTy);
+  auto *F = dyn_cast<Function>(FCallee.getCallee());
+  if(!F) {
+    report_fatal_error("Error while inserting mux builtins");
+  }
+  return F;
 }
 
 static constexpr const char *MuxKernelAttrName = "mux-kernel";

From ab3e15406ffce6f9b0b040f9dfc4e452c63136ef Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 16:31:39 +0000
Subject: [PATCH 22/43] remove libclc-relatd cmake from native cpu cmake

---
 sycl/plugins/native_cpu/CMakeLists.txt | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 26eaa7fb3fdcb..6cb3824652ba2 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -26,17 +26,6 @@ add_sycl_plugin(native_cpu
 
 
 if(NATIVECPU_USE_OCK)
-  # TODO: I'm not sure why we need this here, but we get errors when configuring without it 
-  if(NOT CMAKE_CLC_COMPILE_OBJECT)
-    set(CMAKE_CLC_COMPILE_OBJECT
-      "<CMAKE_CLC_COMPILER> <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE> -emit-llvm")
-  endif()
-  if(NOT CMAKE_CLC_CREATE_STATIC_LIBRARY)
-    set(CMAKE_CLC_CREATE_STATIC_LIBRARY
-      "<CMAKE_CLC_ARCHIVE> -o <TARGET> <OBJECTS>")
-  endif()
-  set(CMAKE_INCLUDE_FLAG_CLC "-I")
-
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
    GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git

From ef1e9205b461a8561ea4a4e00baa498a84184b45 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 16:36:32 +0000
Subject: [PATCH 23/43] Put back O2 in lit test

---
 .../test/check_device_code/native_cpu/native_cpu_subhandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
index b4c3bf9a271df..112cfa625cc67 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_subhandler.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
+// RUN: %clangxx -fsycl-device-only -O2 -g -fexceptions  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
 // RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // Checks that the subhandler is correctly emitted in the module

From e8d7e3b9beacb1e7c4b7a997229261f3bb3dc6fa Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 16:44:07 +0000
Subject: [PATCH 24/43] Update docs

---
 sycl/doc/design/SYCLNativeCPU.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 6c4ddb27f24a9..a1706182f7d2f 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -42,7 +42,7 @@ SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplayso
 
 ```
 python3 buildbot/configure.py \
-  --enable-plugin native_cpu \
+  --native_cpu \
   --cmake-opt=-DNATIVE_CPU_USE_OCK=Off
 ```
 
@@ -193,5 +193,5 @@ Each entry in the array contains the kernel name as a string, and a pointer to t
 
 ## Kernel lowering and execution
 
-The information produced by the device compiler is then employed to correctly lower the kernel LLVM-IR module to the target ISA (this is performed by the driver when `-fsycl-targets=native_cpu` is set). The object file containing the kernel code is linked with the host object file (and libsycl and any other needed library) and the final executable is ran using the SYCL Native CPU PI Plug-in, defined in [pi_native_cpu.cpp](https://github.com/intel/llvm/blob/sycl/sycl/plugins/native_cpu/pi_native_cpu.cpp).
+The information produced by the device compiler is then employed to correctly lower the kernel LLVM-IR module to the target ISA (this is performed by the driver when `-fsycl-targets=native_cpu` is set). The object file containing the kernel code is linked with the host object file (and libsycl and any other needed library) and the final executable is run using the SYCL Native CPU UR Adapter, defined in [the Unified Runtime repo](https://github.com/oneapi-src/unified-runtime/tree/adapters/source/adapters/native_cpu).
 

From e638b8866ca14458955a3d23842328ecfd72f145 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 17 Nov 2023 17:10:34 +0000
Subject: [PATCH 25/43] check after dyn cast

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 13 ++++++++++---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp  |  9 +++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 386eee5f1bcf3..82e540f77ac9f 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -26,14 +26,21 @@
 using namespace llvm;
 using namespace sycl::utils;
 
+cl::opt<bool> ForceNoTail("native-cpu-force-no-tail", cl::init(false),
+    cl::desc("Never emit the peeling loop for vectorized kernels,"
+    "even when the local size is not known to be a multiple of the vector width"));
+
+cl::opt<bool> IsDebug("native-cpu-debug", cl::init(false),
+    cl::desc("Emit extra alloca instructions to preserve the value of live"
+    "vriables between barriers"));
+
 void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
-  // Todo set options properly
   compiler::utils::WorkItemLoopsPassOptions Opts;
-  Opts.IsDebug = false;
-  Opts.ForceNoTail = false;
+  Opts.IsDebug = IsDebug;
+  Opts.ForceNoTail = ForceNoTail;
   MAM.registerPass([&] { return compiler::utils::BuiltinInfoAnalysis(); });
   MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index fd65f633aacb9..e2f0dc3eace86 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -228,6 +228,9 @@ static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) {
       FunctionType::get(RetTy, {DimTy, ValTy, PtrTy}, false);
   auto FCallee = M.getOrInsertFunction(Name, FTy);
   auto *F = dyn_cast<Function>(FCallee.getCallee());
+  if(!F) {
+    report_fatal_error("Error while replacing mux builtins");
+  }
   IRBuilder<> Builder(Ctx);
   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
   Builder.SetInsertPoint(BB);
@@ -270,6 +273,9 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) {
   static FunctionType *FTy = FunctionType::get(RetTy, {DimTy, PtrTy}, false);
   auto FCallee = M.getOrInsertFunction(Name, FTy);
   auto *F = dyn_cast<Function>(FCallee.getCallee());
+  if(!F) {
+    report_fatal_error("Error while replacing mux builtins");
+  }
   IRBuilder<> Builder(Ctx);
   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
   Builder.SetInsertPoint(BB);
@@ -301,6 +307,9 @@ static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
     static FunctionType *FTy = FunctionType::get(RetTy, {ValTy, PtrTy}, false);
     auto FCallee = M.getOrInsertFunction(Name, FTy);
     auto *F = dyn_cast<Function>(FCallee.getCallee());
+    if(!F) {
+      report_fatal_error("Error while replacing mux builtins");
+    }
     IRBuilder<> Builder(Ctx);
     BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
     Builder.SetInsertPoint(BB);

From 49ed3e73a655c62de8d2524ac59f927543f56f62 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Mon, 20 Nov 2023 10:59:40 +0000
Subject: [PATCH 26/43] Use llvm::cast where appropriate

---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 23 ++++---------------
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index e2f0dc3eace86..d100518850b2a 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -227,10 +227,7 @@ static Function *addSetLocalIdFunc(Module &M, StringRef Name, Type *StateType) {
   static FunctionType *FTy =
       FunctionType::get(RetTy, {DimTy, ValTy, PtrTy}, false);
   auto FCallee = M.getOrInsertFunction(Name, FTy);
-  auto *F = dyn_cast<Function>(FCallee.getCallee());
-  if(!F) {
-    report_fatal_error("Error while replacing mux builtins");
-  }
+  auto *F = cast<Function>(FCallee.getCallee());
   IRBuilder<> Builder(Ctx);
   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
   Builder.SetInsertPoint(BB);
@@ -272,10 +269,7 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) {
   Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS);
   static FunctionType *FTy = FunctionType::get(RetTy, {DimTy, PtrTy}, false);
   auto FCallee = M.getOrInsertFunction(Name, FTy);
-  auto *F = dyn_cast<Function>(FCallee.getCallee());
-  if(!F) {
-    report_fatal_error("Error while replacing mux builtins");
-  }
+  auto *F = cast<Function>(FCallee.getCallee());
   IRBuilder<> Builder(Ctx);
   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
   Builder.SetInsertPoint(BB);
@@ -306,10 +300,7 @@ static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
     Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS);
     static FunctionType *FTy = FunctionType::get(RetTy, {ValTy, PtrTy}, false);
     auto FCallee = M.getOrInsertFunction(Name, FTy);
-    auto *F = dyn_cast<Function>(FCallee.getCallee());
-    if(!F) {
-      report_fatal_error("Error while replacing mux builtins");
-    }
+    auto *F = cast<Function>(FCallee.getCallee());
     IRBuilder<> Builder(Ctx);
     BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
     Builder.SetInsertPoint(BB);
@@ -371,9 +362,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
     if (!Glob)
       continue;
     for (const auto &Use : Glob->uses()) {
-      auto I = dyn_cast<CallInst>(Use.getUser());
-      if (!I)
-        report_fatal_error("Unsupported Value in SYCL Native CPU\n");
+      auto I = cast<CallInst>(Use.getUser());
       if (!IsNativeCPUKernel(I->getFunction())) {
         // only use the threadlocal if we have kernels calling builtins
         // indirectly
@@ -436,9 +425,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
     Function *const Glob = Entry.first;
     for (const auto &Use : Glob->uses()) {
       auto *ReplaceFunc = getReplaceFunc(M, Entry.second, StateType);
-      auto I = dyn_cast<CallInst>(Use.getUser());
-      if (!I)
-        report_fatal_error("Unsupported Value in SYCL Native CPU\n");
+      auto I = cast<CallInst>(Use.getUser());
       SmallVector<Value *> Args(I->arg_begin(), I->arg_end());
       Args.push_back(getStateArg(I->getFunction(), CurrentStatePointerTLS));
       auto *NewI = CallInst::Create(ReplaceFunc->getFunctionType(), ReplaceFunc,

From 27936cdf9244752aa17b4bcfc9a61a90cc4540ba Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Tue, 28 Nov 2023 10:49:17 +0000
Subject: [PATCH 27/43] typo

---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index a36d1688d648f..12732ed4df31a 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -454,7 +454,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   }
 
 #ifdef NATIVECPU_USE_OCK
-  // Define __mum_mem_barrier here using the OCK
+  // Define __mux_mem_barrier here using the OCK
   compiler::utils::BuiltinInfo BI;
   for (auto &F : M) {
     if (F.getName() == compiler::utils::MuxBuiltins::mem_barrier) {

From 91c9b774e2a860ad9730d96dfe29daecc0547492 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 1 Dec 2023 16:09:22 +0000
Subject: [PATCH 28/43] use llvm option for disabling vecz

---
 clang/include/clang/Basic/LangOptions.def                | 3 +--
 clang/include/clang/Driver/Options.td                    | 4 ----
 clang/lib/CodeGen/BackendUtil.cpp                        | 6 +++++-
 sycl/test/check_device_code/native_cpu/vectorization.cpp | 3 ++-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 16647342b92f3..3aeb1ef43d797 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -299,8 +299,7 @@ LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the "
         "SYCL integration header")
 LANGOPT(SYCLAllowVirtualFunctions, 1, 0,
         "Allow virtual functions calls in code for SYCL device")
-LANGOPT(SYCLIsNativeCPU      , 1, 0, "Generate code for SYCL NativeCPU")
-LANGOPT(SYCLNativeCPUNoVecz  , 1, 0, "Disable vectorization on SYCL Native CPU")
+LANGOPT(SYCLIsNativeCPU      , 1, 0, "Generate code for SYCL Native CPU")
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
 LANGOPT(OffloadUniformBlock, 1, 0, "Assume that kernels are launched with uniform block sizes (default true for CUDA/HIP and false otherwise)")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index ffd8e3eceeb0c..c21841cc29362 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6516,10 +6516,6 @@ def fsycl_fp32_prec_sqrt : Flag<["-"], "fsycl-fp32-prec-sqrt">, Group<sycl_Group
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"SYCL only. Specify that single precision floating-point sqrt is correctly rounded.">,
   MarshallingInfoFlag<CodeGenOpts<"SYCLFp32PrecSqrt">>;
-def fsycl_native_cpu_no_vecz : Flag<["-"], "fsycl-native-cpu-no-vecz">,
-  Visibility<[ClangOption, CC1Option]>,
-  HelpText<"Disable vectorization on SYCL Native CPU">,
-  MarshallingInfoFlag<LangOpts<"SYCLNativeCPUNoVecz">>;
 
 //===----------------------------------------------------------------------===//
 // FLangOption + NoXarchOption
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 987b7ad31576b..b0a00721eac41 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -118,6 +118,10 @@ static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
 static cl::opt<bool> SYCLNativeCPUBackend(
     "sycl-native-cpu-backend", cl::init(false),
     cl::desc("Run the backend passes for SYCL Native CPU"));
+
+static cl::opt<bool> SYCLNativeCPUNoVecz(
+    "sycl-native-cpu-no-vecz", cl::init(false),
+    cl::desc("Disable vectorizer for SYCL Native CPU"));
 }
 
 namespace {
@@ -1069,7 +1073,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level.getSpeedupLevel(), LangOpts.SYCLNativeCPUNoVecz);
+      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level.getSpeedupLevel(), SYCLNativeCPUNoVecz);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
index 3630aef92305e..7532b5e38fe54 100644
--- a/sycl/test/check_device_code/native_cpu/vectorization.cpp
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -3,7 +3,7 @@
 // RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
 // RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
 // RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0
-// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -fsycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
 #include <sycl/sycl.hpp>
 class Test1;
 int main() {
@@ -17,5 +17,6 @@ int main() {
     // CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
     // CHECK-O0: store i32 42
     // CHECK-DISABLE: store i32 42
+    // CHECK-DISABLE-NOT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
   });
 }

From b1f3fcd3e8954f1c5a5ddc81e23829bdbf242c34 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Wed, 31 Jan 2024 13:24:13 +0000
Subject: [PATCH 29/43] Update OCK tag

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 6 +++---
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp  | 2 +-
 sycl/plugins/native_cpu/CMakeLists.txt         | 9 +++++++--
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 41748ed72fa71..4e43187f8e833 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -50,15 +50,15 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &M
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
     auto queryFunc =
-        [](llvm::Function &F, llvm::ModuleAnalysisManager &,
+        [](const llvm::Function &F, const llvm::ModuleAnalysisManager &,
            llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
       if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
         return false;
       }
       compiler::utils::VectorizationFactor VF(NativeCPUVeczWidth, false);
       vecz::VeczPassOptions VPO;
-      VPO.factor = VF;
-      Opts.emplace_back(VPO);
+      VPO.factor = std::move(VF);
+      Opts.emplace_back(std::move(VPO));
       return true;
     };
     MAM.registerPass([&] { return vecz::VeczPassOptionsAnalysis(queryFunc); });
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 90119ba5371d6..581e322494d59 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -348,7 +348,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 
   // check if any of the kernels is called by some other function.
   // This can happen e.g. with OCK, where wrapper functions are 
-  // created around the origianl kernel.
+  // created around the original kernel.
   bool KernelIsCalled = false;
   for(auto& K : OldKernels) {
     for(auto& U : K->uses()){
diff --git a/sycl/plugins/native_cpu/CMakeLists.txt b/sycl/plugins/native_cpu/CMakeLists.txt
index 37a174c4e379e..87d18f864b55b 100644
--- a/sycl/plugins/native_cpu/CMakeLists.txt
+++ b/sycl/plugins/native_cpu/CMakeLists.txt
@@ -27,8 +27,13 @@ add_sycl_plugin(native_cpu
 if(NATIVECPU_USE_OCK)
   include(FetchContent)
   FetchContent_Declare(oneapi-ck
-   GIT_REPOSITORY https://github.com/PietroGhg/oneapi-construction-kit.git
-   GIT_TAG        pietro/vecz
+   GIT_REPOSITORY https://github.com/codeplaysoftware/oneapi-construction-kit.git
+   #  commit 63f4ba99fc758ffc4268a807b21816b6be1b1b68
+   #  Author: PietroGhg <38155419+PietroGhg@users.noreply.github.com>
+   #  Date:   Wed Jan 31 14:06:36 2024 +0100
+   #  Merge pull request #326 from PietroGhg/pietro/vecz
+   #  Integrate vecz in experimental branch
+   GIT_TAG 63f4ba99fc758ffc4268a807b21816b6be1b1b68
   )
   FetchContent_GetProperties(oneapi-ck)
   if(NOT oneapi-ck_POPULATED)

From db4612eba52a525228ee8523034bd5380f160c9c Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Wed, 31 Jan 2024 13:36:51 +0000
Subject: [PATCH 30/43] Formatting

---
 clang/lib/CodeGen/BackendUtil.cpp             |  9 ++---
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h     |  3 +-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 36 ++++++++++---------
 llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 26 +++++++-------
 sycl/test/native_cpu/link-noinline.cpp        | 30 ++++++++--------
 5 files changed, 55 insertions(+), 49 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 0679dc31590b1..8ca9acf58a108 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -127,9 +127,9 @@ static cl::opt<bool> SYCLNativeCPUBackend(
     cl::desc("Run the backend passes for SYCL Native CPU"));
 } // namespace llvm
 
-static cl::opt<bool> SYCLNativeCPUNoVecz(
-    "sycl-native-cpu-no-vecz", cl::init(false),
-    cl::desc("Disable vectorizer for SYCL Native CPU"));
+static cl::opt<bool>
+    SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false),
+                        cl::desc("Disable vectorizer for SYCL Native CPU"));
 
 namespace {
 
@@ -1082,7 +1082,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level.getSpeedupLevel(), SYCLNativeCPUNoVecz);
+      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
+          MPM, MAM, Level.getSpeedupLevel(), SYCLNativeCPUNoVecz);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index 2df44578f476f..511a62597aaaf 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -18,7 +18,8 @@ namespace sycl {
 namespace utils {
 
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM, unsigned OptLevel, bool DisableVecz);
+                                   ModuleAnalysisManager &MAM,
+                                   unsigned OptLevel, bool DisableVecz);
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 4e43187f8e833..f2f922a2a86fa 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -11,42 +11,46 @@
 // When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/Passes/PassBuilder.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h"
-#include "llvm/Passes/PassBuilder.h"
 #include "llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h"
 
 #ifdef NATIVECPU_USE_OCK
 #include "compiler/utils/builtin_info.h"
 #include "compiler/utils/device_info.h"
+#include "compiler/utils/prepare_barriers_pass.h"
 #include "compiler/utils/sub_group_analysis.h"
 #include "compiler/utils/work_item_loops_pass.h"
 #include "vecz/pass.h"
 #include "vecz/vecz_target_info.h"
-#include "compiler/utils/prepare_barriers_pass.h"
-#include "compiler/utils/sub_group_analysis.h"
-#include "compiler/utils/work_item_loops_pass.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #endif
 
 using namespace llvm;
 using namespace sycl::utils;
 
-cl::opt<bool> ForceNoTail("native-cpu-force-no-tail", cl::init(false),
-    cl::desc("Never emit the peeling loop for vectorized kernels,"
-    "even when the local size is not known to be a multiple of the vector width"));
+cl::opt<bool>
+    ForceNoTail("native-cpu-force-no-tail", cl::init(false),
+                cl::desc("Never emit the peeling loop for vectorized kernels,"
+                         "even when the local size is not known to be a "
+                         "multiple of the vector width"));
 
-cl::opt<bool> IsDebug("native-cpu-debug", cl::init(false),
+cl::opt<bool> IsDebug(
+    "native-cpu-debug", cl::init(false),
     cl::desc("Emit extra alloca instructions to preserve the value of live"
-    "variables between barriers"));
-cl::opt<unsigned> NativeCPUVeczWidth("ncpu-vecz-width", cl::init(8), cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));
-void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &MPM,
-                                   ModuleAnalysisManager &MAM, unsigned OptLevel, bool DisableVecz) {
+             "variables between barriers"));
+cl::opt<unsigned> NativeCPUVeczWidth(
+    "ncpu-vecz-width", cl::init(8),
+    cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));
+void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
+    llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM, unsigned OptLevel,
+    bool DisableVecz) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
-  if(OptLevel != 0 && !DisableVecz) {
+  if (OptLevel != 0 && !DisableVecz) {
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
     auto queryFunc =
@@ -78,11 +82,11 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(llvm::ModulePassManager &M
 
   // Run optimization passes after all the changes we made to the kernels.
   // Todo: check optimization level from clang
-  // Todo: maybe we could find a set of relevant passes instead of re-running the full 
-  // optimization pipeline.
+  // Todo: maybe we could find a set of relevant passes instead of re-running
+  // the full optimization pipeline.
   PassBuilder PB;
   OptimizationLevel Level;
-  switch(OptLevel) {
+  switch (OptLevel) {
   case 0:
     Level = OptimizationLevel::O0;
     break;
diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
index 581e322494d59..6eb76d3f26bc4 100644
--- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
@@ -308,7 +308,7 @@ static Function *getReplaceFunc(Module &M, StringRef Name, Type *StateType) {
 }
 
 static Value *getStateArg(Function *F, llvm::Constant *StateTLS) {
-  // Todo: we should probably cache the state thread local load here 
+  // Todo: we should probably cache the state thread local load here
   // to avoid re-emitting it for each builtin
   if (StateTLS) {
     IRBuilder<> BB(&*F->getEntryBlock().getFirstInsertionPt());
@@ -347,12 +347,12 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
   CurrentStatePointerTLS = nullptr;
 
   // check if any of the kernels is called by some other function.
-  // This can happen e.g. with OCK, where wrapper functions are 
+  // This can happen e.g. with OCK, where wrapper functions are
   // created around the original kernel.
   bool KernelIsCalled = false;
-  for(auto& K : OldKernels) {
-    for(auto& U : K->uses()){
-      if(isa<CallBase>(U.getUser())) {
+  for (auto &K : OldKernels) {
+    for (auto &U : K->uses()) {
+      if (isa<CallBase>(U.getUser())) {
         KernelIsCalled = true;
       }
     }
@@ -395,18 +395,18 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
 #ifdef NATIVECPU_USE_OCK
     auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
     OldF->setName(Name);
-    // if vectorization occurred, at this point we have a wrapper function that 
-    // runs the vectorized kernel and peels using the scalar kernel. We make it so
-    // this wrapper steals the original kernel name.
-    std::optional<compiler::utils::LinkMetadataResult> veczR = compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-    if(veczR) {
+    // if vectorization occurred, at this point we have a wrapper function that
+    // runs the vectorized kernel and peels using the scalar kernel. We make it
+    // so this wrapper steals the original kernel name.
+    std::optional<compiler::utils::LinkMetadataResult> veczR =
+        compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
+    if (veczR) {
       auto ScalarF = veczR.value().first;
       OldF->takeName(ScalarF);
       ScalarF->setName(OldF->getName() + "_scalar");
-    }
-    else if(Name != OldF->getName()) {
+    } else if (Name != OldF->getName()) {
       auto RealKernel = M.getFunction(Name);
-      if(RealKernel) {
+      if (RealKernel) {
         // the real kernel was not inlined in the wrapper, steal its name
         OldF->takeName(RealKernel);
       } else {
diff --git a/sycl/test/native_cpu/link-noinline.cpp b/sycl/test/native_cpu/link-noinline.cpp
index ca26b808b776a..763f93312e4a3 100644
--- a/sycl/test/native_cpu/link-noinline.cpp
+++ b/sycl/test/native_cpu/link-noinline.cpp
@@ -60,21 +60,21 @@ int main() {
   sycl::queue deviceQueue;
   sycl::range<1> numOfItems{N};
   {
-  sycl::buffer<int, 1> bufferC(C.data(), numOfItems);
+    sycl::buffer<int, 1> bufferC(C.data(), numOfItems);
 
-  if (test_all(HOST_RET) != HOST_RET)
-    return 1;
+    if (test_all(HOST_RET) != HOST_RET)
+      return 1;
 
-  deviceQueue
-      .submit([&](sycl::handler &cgh) {
-        auto accessorC = bufferC.get_access<sycl_write>(cgh);
+    deviceQueue
+        .submit([&](sycl::handler &cgh) {
+          auto accessorC = bufferC.get_access<sycl_write>(cgh);
 
-        auto kern = [=](sycl::id<1> wiID) {
-          accessorC[wiID] = test_all(DEVICE_RET);
-        };
-        cgh.parallel_for<class SimpleVadd>(numOfItems, kern);
-      })
-      .wait();
+          auto kern = [=](sycl::id<1> wiID) {
+            accessorC[wiID] = test_all(DEVICE_RET);
+          };
+          cgh.parallel_for<class SimpleVadd>(numOfItems, kern);
+        })
+        .wait();
   }
 
   bool pass = true;
@@ -85,9 +85,9 @@ int main() {
       pass = false;
     }
   }
-  if(pass) {
-  std::cout << "The results are correct!\n";
-  return 0;
+  if (pass) {
+    std::cout << "The results are correct!\n";
+    return 0;
   }
   return 2;
 }

From 69c83b69911c4bdedef145edfe13960fdcb772e8 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 14:27:18 +0000
Subject: [PATCH 31/43] Change vecz width option name and location

---
 clang/lib/CodeGen/BackendUtil.cpp             |  8 ++------
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h     |  2 +-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 20 ++++++++++++-------
 .../native_cpu/vectorization.cpp              |  4 ++--
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 8ca9acf58a108..c188b95c28e08 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -127,10 +127,6 @@ static cl::opt<bool> SYCLNativeCPUBackend(
     cl::desc("Run the backend passes for SYCL Native CPU"));
 } // namespace llvm
 
-static cl::opt<bool>
-    SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false),
-                        cl::desc("Disable vectorizer for SYCL Native CPU"));
-
 namespace {
 
 // Default filename used for profile generation.
@@ -1082,8 +1078,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
-          MPM, MAM, Level.getSpeedupLevel(), SYCLNativeCPUNoVecz);
+      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM,
+                                                       Level.getSpeedupLevel());
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index 511a62597aaaf..64050b41f37c8 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -19,7 +19,7 @@ namespace utils {
 
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM,
-                                   unsigned OptLevel, bool DisableVecz);
+                                   unsigned OptLevel);
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index f2f922a2a86fa..dd51d83875718 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -31,26 +31,32 @@
 using namespace llvm;
 using namespace sycl::utils;
 
-cl::opt<bool>
+static cl::opt<bool>
     ForceNoTail("native-cpu-force-no-tail", cl::init(false),
                 cl::desc("Never emit the peeling loop for vectorized kernels,"
                          "even when the local size is not known to be a "
                          "multiple of the vector width"));
 
-cl::opt<bool> IsDebug(
+static cl::opt<bool> IsDebug(
     "native-cpu-debug", cl::init(false),
     cl::desc("Emit extra alloca instructions to preserve the value of live"
              "variables between barriers"));
-cl::opt<unsigned> NativeCPUVeczWidth(
-    "ncpu-vecz-width", cl::init(8),
+
+static cl::opt<unsigned> NativeCPUVeczWidth(
+    "sycl-native-cpu-vecz-width", cl::init(8),
     cl::desc("Vector width for SYCL Native CPU vectorizer, defaults to 8"));
+
+static cl::opt<bool>
+    SYCLNativeCPUNoVecz("sycl-native-cpu-no-vecz", cl::init(false),
+                        cl::desc("Disable vectorizer for SYCL Native CPU"));
+
 void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
-    llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM, unsigned OptLevel,
-    bool DisableVecz) {
+    llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM,
+    unsigned OptLevel) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
-  if (OptLevel != 0 && !DisableVecz) {
+  if (OptLevel != 0 && !SYCLNativeCPUNoVecz) {
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
     auto queryFunc =
diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
index 7532b5e38fe54..2bbbbeb426db2 100644
--- a/sycl/test/check_device_code/native_cpu/vectorization.cpp
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
 // RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT
-// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
-// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -ncpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16
+// RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=4 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-4
 // RUN: %clangxx -O0 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-O0
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-no-vecz -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DISABLE
 #include <sycl/sycl.hpp>

From 4abfb0e682c08c79de9b0f1c614115f4665bd0a1 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 15:09:21 +0000
Subject: [PATCH 32/43] Update docs

---
 sycl/doc/design/SYCLNativeCPU.md | 53 ++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 89ced50c64c39..46121c8eb18e1 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -69,11 +69,18 @@ cmake \
 
 Note that a number of `e2e` tests are currently still failing.
 
+# Vectorization
+
+With the integration of the OneAPI Construction Kit, the SYCL Native CPU target gained support for Whole Function Vectorization.\\
+Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
+* `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
+* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
+For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical details) section.
+
 ## Ongoing work
 
 * Complete support for remaining SYCL features, including but not limited to
   * math and other builtins
-* Vectorization (e.g. Whole Function Vectorization)
 * Subgroup support
 * Performance optimizations
 
@@ -175,6 +182,48 @@ As you can see, the `subhandler` steals the kernel's function name, and receives
 
 On SYCL Native CPU, calls to `__spirv_ControlBarrier` are handled using the `WorkItemLoopsPass` from the oneAPI Construction Kit. This pass handles barriers by splitting the kernel between calls calls to `__spirv_ControlBarrier`, and creating a wrapper that runs the subkernels over the local range. In order to correctly interface to the oneAPI Construction Kit pass pipeline, SPIRV builtins are converted to `mux` builtins (used by the OCK) by the `ConvertToMuxBuiltinsSYCLNativeCPUPass`.
 
+## Vectorization
+
+The OneAPI Construction Kit's Whole Function Vectorizer is executed as an LLVM Pass. Considering the following input function:
+
+```llvm
+define void @SimpleVadd(i32*, i32*, i32*) {
+  %5 = call i64 @_Z13get_global_idj(i32 0)
+  %6 = getelementptr inbounds i32, ptr %1, i64 %5
+  %7 = load i32, ptr %6, align 4
+  %8 = getelementptr inbounds i32, ptr %2, i64 %5
+  %9 = load i32, ptr %8, align 4
+  %10 = add nsw i32 %9, %7
+  %11 = getelementptr inbounds i32, ptr %0, i64 %5
+  store i32 %10, ptr %11, align 4
+  ret void
+}
+```
+
+With a vector width of 8, the vectorizer will produce:
+
+```llvm
+define void @__vecz_v8_SimpleVadd(i32*, i32*, i32*) !codeplay_ca_vecz.derived !2 {
+  %5 = call i64 @_Z13get_global_idj(i32 0)
+  %6 = getelementptr inbounds i32, ptr %1, i64 %5
+  %7 = load <8 x i32>, ptr %6, align 4
+  %8 = getelementptr inbounds i32, ptr %2, i64 %5
+  %9 = load <8 x i32>, ptr %8, align 4
+  %10 = add nsw <8 x i32> %9, %7
+  %11 = getelementptr inbounds i32, ptr %0, i64 %5
+  store <8 x i32> %12, ptr %11, align 4
+  ret void
+}
+!1 = !{i32 8, i32 0, i32 0, i32 0}
+!2 = !{!1, ptr @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperI10SimpleVaddEE}
+```
+
+The `__vecz_v8_SimpleVadd` function is the vectorized version of the original function. It receives arguments of the same type,
+and has the `codeplay_ca_vecz.derived` metadata node attached. The metadata node contains information about the vectorization width,
+and points to the original version of the function. This information is used later in the pass pipeline by the `WorkItemLoopsPass`,
+which will account for the vectorization when creating the Work Item Loops, and use the original version of the function to add
+peeling loops.
+
 ## Kernel registration
 
 In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the SYCL Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array.
@@ -189,7 +238,7 @@ In order to register the SYCL Native CPU kernels to the SYCL runtime, we applied
     BinaryStart                         BinaryEnd  
 ```
 
-Each entry in the array contains the kernel name as a string, and a pointer to the `sunhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.
+Each entry in the array contains the kernel name as a string, and a pointer to the `subhandler` function declaration. Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module.
 
 ## Kernel lowering and execution
 

From 9ae8a2138a8376fb88f0529fcc7ef27eab6cd95f Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 15:19:05 +0000
Subject: [PATCH 33/43] Update docs

---
 sycl/doc/design/SYCLNativeCPU.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 46121c8eb18e1..d0f761df48530 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -74,7 +74,7 @@ Note that a number of `e2e` tests are currently still failing.
 With the integration of the OneAPI Construction Kit, the SYCL Native CPU target gained support for Whole Function Vectorization.\\
 Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
 * `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
-* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
+* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.\\
 For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical details) section.
 
 ## Ongoing work

From 05ae105c2c41c5516ed72e066c5157c3766255bb Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 15:20:22 +0000
Subject: [PATCH 34/43] New line in docs

---
 sycl/doc/design/SYCLNativeCPU.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index d0f761df48530..c6be32f663de9 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -74,7 +74,8 @@ Note that a number of `e2e` tests are currently still failing.
 With the integration of the OneAPI Construction Kit, the SYCL Native CPU target gained support for Whole Function Vectorization.\\
 Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
 * `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
-* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.\\
+* `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
+
 For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical details) section.
 
 ## Ongoing work
@@ -176,6 +177,7 @@ entry:
   ret void
 }
 ```
+
 As you can see, the `subhandler` steals the kernel's function name, and receives two pointer arguments: the first one points to the kernel arguments from the SYCL runtime, and the second one to the `__nativecpu_state` struct.
 
 ## Handling barriers 

From 432681ec881985109e336a901a1e09ed0d1e2f55 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 15:21:50 +0000
Subject: [PATCH 35/43] Link to section

---
 sycl/doc/design/SYCLNativeCPU.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index c6be32f663de9..bd349486e3306 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -76,7 +76,7 @@ Whole Function Vectorization is enabled by default, and can be controlled throug
 * `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
 * `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
 
-For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical details) section.
+For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical-details) section.
 
 ## Ongoing work
 

From fc6e1d338be955e951eeaf18a13b5573715681c4 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 2 Feb 2024 15:22:51 +0000
Subject: [PATCH 36/43] Link to section

---
 sycl/doc/design/SYCLNativeCPU.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index bd349486e3306..29e4f6f92c830 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -76,7 +76,7 @@ Whole Function Vectorization is enabled by default, and can be controlled throug
 * `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
 * `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.
 
-For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#Technical-details) section.
+For more details on how the Whole Function Vectorizer is integrated for SYCL Native CPU, refer to the [Technical details[(#technical-details) section.
 
 ## Ongoing work
 

From 94d849cc9a71c33561110dae11ed04cc82d5f32d Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Mon, 5 Feb 2024 13:29:05 +0000
Subject: [PATCH 37/43] Update lit tests

---
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp         |  8 ++++----
 .../native_cpu/native_cpu_builtins.cpp                 | 10 +++++-----
 .../check_device_code/native_cpu/vectorization.cpp     |  3 ++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index dd51d83875718..34d912c35b471 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -59,7 +59,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
   if (OptLevel != 0 && !SYCLNativeCPUNoVecz) {
     MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
     MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
-    auto queryFunc =
+    auto QueryFunc =
         [](const llvm::Function &F, const llvm::ModuleAnalysisManager &,
            llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
       if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
@@ -71,14 +71,14 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
       Opts.emplace_back(std::move(VPO));
       return true;
     };
-    MAM.registerPass([&] { return vecz::VeczPassOptionsAnalysis(queryFunc); });
+    MAM.registerPass([QueryFunc] { return vecz::VeczPassOptionsAnalysis(QueryFunc); });
     MPM.addPass(vecz::RunVeczPass());
   }
   compiler::utils::WorkItemLoopsPassOptions Opts;
   Opts.IsDebug = IsDebug;
   Opts.ForceNoTail = ForceNoTail;
-  MAM.registerPass([&] { return compiler::utils::BuiltinInfoAnalysis(); });
-  MAM.registerPass([&] { return compiler::utils::SubgroupAnalysis(); });
+  MAM.registerPass([] { return compiler::utils::BuiltinInfoAnalysis(); });
+  MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::PrepareBarriersPass());
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
   MPM.addPass(AlwaysInlinerPass());
diff --git a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
index 741e8b6afa7e4..783fd6a30a7e1 100644
--- a/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
+++ b/sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp
@@ -1,17 +1,17 @@
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm  -o %t_temp.ll %s
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm %s -o %t_temp.ll
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s
 
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o %t_temp.ll
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
 
 // Check that builtins are emitted as expected
-// RUN: %clangxx -mllvm -sycl-native-cpu-backend -O0 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-BT
+// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-BT
 
 // check that we added the state struct as a function argument, and that we
 // inject the calls to our builtins.
diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
index 2bbbbeb426db2..d1d30de5574b2 100644
--- a/sycl/test/check_device_code/native_cpu/vectorization.cpp
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -16,7 +16,8 @@ int main() {
     // CHECK-16: store <16 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
     // CHECK-4: store <4 x i32> <i32 42, i32 42, i32 42, i32 42>
     // CHECK-O0: store i32 42
+    // CHECK-O0-NOT: store <{{.*}}>
     // CHECK-DISABLE: store i32 42
-    // CHECK-DISABLE-NOT: store <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+    // CHECK-DISABLE-NOT: store <{{.*}}>
   });
 }

From 263d58faa87eb8cb25eada611355045b174c16f5 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Tue, 6 Feb 2024 13:18:10 +0000
Subject: [PATCH 38/43] Use llvm::OptimiaztionLevel

---
 clang/lib/CodeGen/BackendUtil.cpp             |  3 +-
 .../llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h     |  3 +-
 .../lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 28 ++++---------------
 3 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index d1b526424d9c5..c24d47e790906 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -1080,7 +1081,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
 
     if (SYCLNativeCPUBackend) {
       llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM,
-                                                       Level.getSpeedupLevel());
+                                                       Level);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
index 64050b41f37c8..404b9ffced6c4 100644
--- a/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
+++ b/llvm/include/llvm/SYCLLowerIR/UtilsSYCLNativeCPU.h
@@ -12,6 +12,7 @@
 #pragma once
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Passes/OptimizationLevel.h"
 
 namespace llvm {
 namespace sycl {
@@ -19,7 +20,7 @@ namespace utils {
 
 void addSYCLNativeCPUBackendPasses(ModulePassManager &MPM,
                                    ModuleAnalysisManager &MAM,
-                                   unsigned OptLevel);
+                                   OptimizationLevel OptLevel);
 const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_get_global_id";
 const constexpr char NativeCPUGlobaRange[] =
     "__dpcpp_nativecpu_get_global_range";
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index 34d912c35b471..d5012e90fec96 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -11,6 +11,7 @@
 // When NATIVECPU_USE_OCK is set, adds passes from the oneAPI Construction Kit.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/Passes/OptimizationLevel.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/SYCLLowerIR/ConvertToMuxBuiltinsSYCLNativeCPU.h"
 #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
@@ -52,13 +53,13 @@ static cl::opt<bool>
 
 void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
     llvm::ModulePassManager &MPM, ModuleAnalysisManager &MAM,
-    unsigned OptLevel) {
+    OptimizationLevel OptLevel) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
-  if (OptLevel != 0 && !SYCLNativeCPUNoVecz) {
-    MAM.registerPass([&] { return vecz::TargetInfoAnalysis(); });
-    MAM.registerPass([&] { return compiler::utils::DeviceInfoAnalysis(); });
+  if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
+    MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
+    MAM.registerPass([] { return compiler::utils::DeviceInfoAnalysis(); });
     auto QueryFunc =
         [](const llvm::Function &F, const llvm::ModuleAnalysisManager &,
            llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) -> bool {
@@ -91,22 +92,5 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
   // Todo: maybe we could find a set of relevant passes instead of re-running
   // the full optimization pipeline.
   PassBuilder PB;
-  OptimizationLevel Level;
-  switch (OptLevel) {
-  case 0:
-    Level = OptimizationLevel::O0;
-    break;
-  case 1:
-    Level = OptimizationLevel::O1;
-    break;
-  case 2:
-    Level = OptimizationLevel::O2;
-    break;
-  case 3:
-    Level = OptimizationLevel::O3;
-    break;
-  default:
-    llvm_unreachable("Unsupported opt level");
-  }
-  MPM.addPass(PB.buildPerModuleDefaultPipeline(Level));
+  MPM.addPass(PB.buildPerModuleDefaultPipeline(OptLevel));
 }

From cf584d23f2d3465984c57fd8b23fae1af05a8798 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Tue, 6 Feb 2024 13:25:57 +0000
Subject: [PATCH 39/43] Updated vector add test

---
 sycl/test/native_cpu/vector-add.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sycl/test/native_cpu/vector-add.cpp b/sycl/test/native_cpu/vector-add.cpp
index 79940e38bc061..d8f851a0cfb04 100644
--- a/sycl/test/native_cpu/vector-add.cpp
+++ b/sycl/test/native_cpu/vector-add.cpp
@@ -6,6 +6,11 @@
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -g -o %t-debug
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t-debug
 
+// Test with vector width set manually, this ensures that we peel correctly when doing
+// vectorization.
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -mllvm -sycl-native-cpu-vecz-width=4 %s -g -o %t-vec
+// RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t-vec
+
 #include <sycl/sycl.hpp>
 
 #include <array>
@@ -17,8 +22,9 @@ constexpr sycl::access::mode sycl_write = sycl::access::mode::write;
 class SimpleVadd;
 
 int main() {
-  const size_t N = 4;
-  std::array<int, N> A = {{1, 2, 3, 4}}, B = {{2, 3, 4, 5}}, C{{0, 0, 0, 0}};
+  const size_t N = 5;
+  std::array<int, N> A = {{1, 2, 3, 4, 5}}, B = {{2, 3, 4, 5, 6}},
+    C{{0, 0, 0, 0, 0}};
   sycl::queue deviceQueue;
   sycl::range<1> numOfItems{N};
   sycl::buffer<int, 1> bufferA(A.data(), numOfItems);

From 9ad5e36f48a5e9e8d38547519831e9688d0042a9 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Wed, 7 Feb 2024 11:55:47 +0000
Subject: [PATCH 40/43] Update docs

---
 sycl/doc/design/SYCLNativeCPU.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index 29e4f6f92c830..91ad393df7f62 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -71,7 +71,8 @@ Note that a number of `e2e` tests are currently still failing.
 
 # Vectorization
 
-With the integration of the OneAPI Construction Kit, the SYCL Native CPU target gained support for Whole Function Vectorization.\\
+With the integration of the OneAPI Construction Kit, the SYCL Native CPU target
+also gained support for Whole Function Vectorization.\\
 Whole Function Vectorization is enabled by default, and can be controlled through these compiler options:
 * `-mllvm -sycl-native-cpu-no-vecz`: disable Whole Function Vectorization.
 * `-mllvm -sycl-native-cpu-vecz-width`: sets the vector width to the specified value, defaults to 8.

From 7349939aadbfc80f4c368ed9f6f2c1143a8c67aa Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 8 Feb 2024 10:44:13 +0000
Subject: [PATCH 41/43] Formatting

---
 clang/lib/CodeGen/BackendUtil.cpp              | 3 +--
 llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp | 3 ++-
 sycl/test/native_cpu/vector-add.cpp            | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index c24d47e790906..d7e49822bf58b 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1080,8 +1080,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
     }
 
     if (SYCLNativeCPUBackend) {
-      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM,
-                                                       Level);
+      llvm::sycl::utils::addSYCLNativeCPUBackendPasses(MPM, MAM, Level);
     }
     if (LangOpts.SYCLIsDevice) {
       MPM.addPass(SYCLMutatePrintfAddrspacePass());
diff --git a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
index d5012e90fec96..25f50ebd969a1 100644
--- a/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLLowerIR/PipelineSYCLNativeCPU.cpp
@@ -72,7 +72,8 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
       Opts.emplace_back(std::move(VPO));
       return true;
     };
-    MAM.registerPass([QueryFunc] { return vecz::VeczPassOptionsAnalysis(QueryFunc); });
+    MAM.registerPass(
+        [QueryFunc] { return vecz::VeczPassOptionsAnalysis(QueryFunc); });
     MPM.addPass(vecz::RunVeczPass());
   }
   compiler::utils::WorkItemLoopsPassOptions Opts;
diff --git a/sycl/test/native_cpu/vector-add.cpp b/sycl/test/native_cpu/vector-add.cpp
index d8f851a0cfb04..7ac66c1852285 100644
--- a/sycl/test/native_cpu/vector-add.cpp
+++ b/sycl/test/native_cpu/vector-add.cpp
@@ -6,8 +6,8 @@
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu %s -g -o %t-debug
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t-debug
 
-// Test with vector width set manually, this ensures that we peel correctly when doing
-// vectorization.
+// Test with vector width set manually, this ensures that we peel correctly when
+// doing vectorization.
 // RUN: %clangxx -fsycl -fsycl-targets=native_cpu -mllvm -sycl-native-cpu-vecz-width=4 %s -g -o %t-vec
 // RUN: env ONEAPI_DEVICE_SELECTOR="native_cpu:cpu" %t-vec
 
@@ -24,7 +24,7 @@ class SimpleVadd;
 int main() {
   const size_t N = 5;
   std::array<int, N> A = {{1, 2, 3, 4, 5}}, B = {{2, 3, 4, 5, 6}},
-    C{{0, 0, 0, 0, 0}};
+                     C{{0, 0, 0, 0, 0}};
   sycl::queue deviceQueue;
   sycl::range<1> numOfItems{N};
   sycl::buffer<int, 1> bufferA(A.data(), numOfItems);

From 5957645b9c60fada376dbc879570b06d2212ee37 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Thu, 8 Feb 2024 11:32:35 +0000
Subject: [PATCH 42/43] Mark vectorization test are require native_cpu_be

---
 sycl/test/check_device_code/native_cpu/vectorization.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/test/check_device_code/native_cpu/vectorization.cpp b/sycl/test/check_device_code/native_cpu/vectorization.cpp
index d1d30de5574b2..a6b5211bff4f7 100644
--- a/sycl/test/check_device_code/native_cpu/vectorization.cpp
+++ b/sycl/test/check_device_code/native_cpu/vectorization.cpp
@@ -1,3 +1,4 @@
+// REQUIRES: native_cpu_be
 // RUN: %clangxx -fsycl-device-only  -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -S -emit-llvm  -o %t_temp.ll %s
 // RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-DEFAULT
 // RUN: %clangxx -O2 -mllvm -sycl-native-cpu-backend -mllvm -sycl-native-cpu-vecz-width=16 -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-16

From 655afa6b0cd5300abf3ffbd33c34a708f203fda0 Mon Sep 17 00:00:00 2001
From: PietroGhg <pietro.ghiglio@codeplay.com>
Date: Fri, 9 Feb 2024 13:25:13 +0000
Subject: [PATCH 43/43] Remove unnecessary include

---
 clang/lib/CodeGen/BackendUtil.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index d7e49822bf58b..1c8d5d6b7e479 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -28,7 +28,6 @@
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"