Skip to content

Commit

Permalink
[SYCL] [NATIVECPU] Remove dependencies to sycl in UR adapter (intel#1…
Browse files Browse the repository at this point in the history
…1685)

This PR removes dependencies to the sycl headers/library in the Native
CPU UR adapter.
`sycl/include/sycl/detail/native_cpu.hpp` has been moved to
`sycl/plugins/unified_runtime/ur/adapters/native_cpu/nativecpu_state.hpp`,
and the definitions of work item builtins has been moved from that
header to the compiler, which now emits them in the
`PrepareSYCLNativeCPUPass`.
  • Loading branch information
PietroGhg authored Nov 1, 2023
1 parent 27db767 commit 986a7ec
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 187 deletions.
16 changes: 1 addition & 15 deletions clang/test/CodeGenSYCL/native_cpu_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,7 @@
// RUN: FileCheck -input-file=%t.ll %s

#include "sycl.hpp"
typedef __typeof__(sizeof(int)) size_t;
struct __nativecpu_state {
alignas(16) size_t MGlobal_id[3];
};
#define __SYCL_HC_ATTRS \
__attribute__((weak)) __attribute((alwaysinline)) \
[[intel::device_indirectly_callable]]

extern "C" __SYCL_HC_ATTRS __attribute((address_space(0))) size_t *
__dpcpp_nativecpu_global_id(__attribute((address_space(0)))
__nativecpu_state *s) {
return &(s->MGlobal_id[0]);
}


typedef long unsigned int size_t;

using namespace sycl;
const size_t N = 10;
Expand Down
95 changes: 81 additions & 14 deletions llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//

#include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h"
#include "llvm/BinaryFormat/MsgPack.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/PassManager.h"
#include "llvm/SYCLLowerIR/SYCLUtils.h"
Expand Down Expand Up @@ -49,6 +51,17 @@
using namespace llvm;

namespace {
static const constexpr char NativeCPUGlobalId[] = "__dpcpp_nativecpu_global_id";
static const constexpr char NativeCPUGlobaRange[] =
"__dpcpp_nativecpu_global_range";
static const constexpr char NativeCPUGlobalOffset[] =
"__dpcpp_nativecpu_get_global_offset";
static const constexpr char NativeCPULocalId[] =
"__dpcpp_nativecpu_get_local_id";
static const constexpr char NativeCPUNumGroups[] =
"__dpcpp_nativecpu_get_num_groups";
static const constexpr char NativeCPUWGSize[] = "__dpcpp_nativecpu_get_wg_size";
static const constexpr char NativeCPUWGId[] = "__dpcpp_nativecpu_get_wg_id";

void fixCallingConv(Function *F) {
F->setCallingConv(llvm::CallingConv::C);
Expand Down Expand Up @@ -223,30 +236,84 @@ Function *cloneFunctionAndAddParam(Function *OldF, Type *T,
} \
}
#define GEN_xyz(b_name, len, ncpu_name) \
GEN_p(#b_name "_x", len, #ncpu_name, 0), \
GEN_p(#b_name "_y", len, #ncpu_name, 1), \
GEN_p(#b_name "_z", len, #ncpu_name, 2)
GEN_p(#b_name "_x", len, ncpu_name, 0), \
GEN_p(#b_name "_y", len, ncpu_name, 1), \
GEN_p(#b_name "_z", len, ncpu_name, 2)

// Todo: add support for more SPIRV builtins here
static const std::pair<std::pair<StringRef, StringRef>,
std::pair<StringRef, unsigned int>>
BuiltinNamesMap[] = {
GEN_xyz(__spirv_GlobalInvocationId, 28, __dpcpp_nativecpu_global_id),
GEN_xyz(__spirv_GlobalSize, 20, __dpcpp_nativecpu_global_range),
GEN_xyz(__spirv_GlobalOffset, 22, __dpcpp_nativecpu_get_global_offset),
GEN_xyz(__spirv_LocalInvocationId, 27, __dpcpp_nativecpu_get_local_id),
GEN_xyz(__spirv_NumWorkgroups, 23, __dpcpp_nativecpu_get_num_groups),
GEN_xyz(__spirv_WorkgroupSize, 23, __dpcpp_nativecpu_get_wg_size),
GEN_xyz(__spirv_WorkgroupId, 21, __dpcpp_nativecpu_get_wg_id),
GEN_xyz(__spirv_GlobalInvocationId, 28, NativeCPUGlobalId),
GEN_xyz(__spirv_GlobalSize, 20, NativeCPUGlobaRange),
GEN_xyz(__spirv_GlobalOffset, 22, NativeCPUGlobalOffset),
GEN_xyz(__spirv_LocalInvocationId, 27, NativeCPULocalId),
GEN_xyz(__spirv_NumWorkgroups, 23, NativeCPUNumGroups),
GEN_xyz(__spirv_WorkgroupSize, 23, NativeCPUWGSize),
GEN_xyz(__spirv_WorkgroupId, 21, NativeCPUWGId),
};

static inline bool IsForVisualStudio(StringRef triple_str) {
llvm::Triple triple(triple_str);
return triple.isKnownWindowsMSVCEnvironment();
}

static Function *getReplaceFunc(const Module &M, StringRef Name) {
static constexpr unsigned int NativeCPUGlobalAS = 1;
static constexpr char StateTypeName[] = "struct.__nativecpu_state";

static Type *getStateType(Module &M) {
// %struct.__nativecpu_state = type { [3 x i64], [3 x i64], [3 x i64], [3 x
// i64], [3 x i64], [3 x i64], [3 x i64] } Check that there's no
// __nativecpu_state type
auto Types = M.getIdentifiedStructTypes();
bool HasStateT =
llvm::any_of(Types, [](auto T) { return T->getName() == StateTypeName; });
if (HasStateT)
report_fatal_error("Native CPU state unexpectedly found in the module.");
auto &Ctx = M.getContext();
auto *I64Ty = Type::getInt64Ty(Ctx);
auto *Array3dTy = ArrayType::get(I64Ty, 3);
std::array<Type *, 7> Elements;
Elements.fill(Array3dTy);
auto *StateType = StructType::create(Ctx, StateTypeName);
StateType->setBody(Elements);
return StateType;
}

static const StringMap<unsigned> OffsetMap{
{NativeCPUGlobalId, 0}, {NativeCPUGlobaRange, 1},
{NativeCPUWGSize, 2}, {NativeCPUWGId, 3},
{NativeCPULocalId, 4}, {NativeCPUNumGroups, 5},
{NativeCPUGlobalOffset, 6}};

static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) {
auto &Ctx = M.getContext();
Type *I64Ty = Type::getInt64Ty(Ctx);
Type *I32Ty = Type::getInt32Ty(Ctx);
Type *RetTy = I64Ty;
Type *DimTy = I32Ty;
Type *PtrTy = PointerType::get(Ctx, NativeCPUGlobalAS);
static FunctionType *FTy = FunctionType::get(RetTy, {DimTy, PtrTy}, false);
auto FCallee = M.getOrInsertFunction(Name, FTy);
auto *F = dyn_cast<Function>(FCallee.getCallee());
IRBuilder<> Builder(Ctx);
BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
Builder.SetInsertPoint(BB);
auto *IdxProm = Builder.CreateZExt(F->getArg(0), DimTy, "idxprom");
auto *Zero = ConstantInt::get(I64Ty, 0);
auto *Offset = ConstantInt::get(I32Ty, OffsetMap.at(Name));
auto *GEP =
Builder.CreateGEP(StateType, F->getArg(1), {Zero, Offset, IdxProm});
auto *Load = Builder.CreateLoad(I64Ty, GEP);
Builder.CreateRet(Load);
F->setLinkage(GlobalValue::LinkageTypes::WeakAnyLinkage);
return F;
}

static Function *getReplaceFunc(Module &M, StringRef Name, Type *StateType) {
Function *F = M.getFunction(Name);
if (!F)
return addReplaceFunc(M, Name, StateType);
assert(F && "Error retrieving replace function");
return F;
}
Expand Down Expand Up @@ -280,8 +347,8 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
// Materialize builtins
// First we add a pointer to the Native CPU state as arg to all the
// kernels.
Type *StateType =
StructType::getTypeByName(M.getContext(), "struct.__nativecpu_state");
Type *StateType = getStateType(M);
// Todo: fix this check since we are emitting the state type in the pass now
if (!StateType)
return PreservedAnalyses::all();
Type *StatePtrType = PointerType::get(StateType, 1);
Expand Down Expand Up @@ -349,7 +416,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
SmallVector<Instruction *> ToRemove;
Function *const Glob = Entry.first;
for (const auto &Use : Glob->uses()) {
auto *ReplaceFunc = getReplaceFunc(M, Entry.second.first);
auto *ReplaceFunc = getReplaceFunc(M, Entry.second.first, StateType);
auto I = dyn_cast<CallInst>(Use.getUser());
if (!I)
report_fatal_error("Unsupported Value in SYCL Native CPU\n");
Expand Down
114 changes: 0 additions & 114 deletions sycl/include/sycl/detail/native_cpu.hpp

This file was deleted.

4 changes: 0 additions & 4 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,6 @@
#include <sycl/usm/usm_enums.hpp>
#include <sycl/usm/usm_pointer_info.hpp>

#ifdef __SYCL_NATIVE_CPU__
#include <sycl/detail/native_cpu.hpp>
#endif

#include <assert.h>
#include <functional>
#include <memory>
Expand Down
3 changes: 0 additions & 3 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,9 @@ if("native_cpu" IN_LIST SYCL_ENABLE_PLUGINS)
"ur/adapters/native_cpu/ur_interface_loader.cpp"
"ur/adapters/native_cpu/usm.cpp"
"ur/adapters/native_cpu/usm_p2p.cpp"
INCLUDE_DIRS
${sycl_inc_dir}
LIBRARIES
UnifiedRuntime-Headers
Threads::Threads
sycl
OpenCL-Headers
)

Expand Down
57 changes: 27 additions & 30 deletions sycl/plugins/unified_runtime/ur/adapters/native_cpu/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,37 @@
//
//===----------------------------------------------------------------------===//

#include <sycl/detail/cg_types.hpp>
#include <array>
#include <cstdint>

#include "ur_api.h"

#include "common.hpp"
#include "kernel.hpp"
#include "memory.hpp"

sycl::detail::NDRDescT getNDRDesc(uint32_t WorkDim,
const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize,
const size_t *LocalWorkSize) {
// Todo: we flip indexes here, I'm not sure we should, if we don't we need to
// un-flip them in the spirv builtins definitions as well
sycl::detail::NDRDescT Res;
switch (WorkDim) {
case 1:
Res.set<1>(sycl::nd_range<1>({GlobalWorkSize[0]}, {LocalWorkSize[0]},
{GlobalWorkOffset[0]}));
break;
case 2:
Res.set<2>(sycl::nd_range<2>({GlobalWorkSize[0], GlobalWorkSize[1]},
{LocalWorkSize[0], LocalWorkSize[1]},
{GlobalWorkOffset[0], GlobalWorkOffset[1]}));
break;
case 3:
Res.set<3>(sycl::nd_range<3>(
{GlobalWorkSize[0], GlobalWorkSize[1], GlobalWorkSize[2]},
{LocalWorkSize[0], LocalWorkSize[1], LocalWorkSize[2]},
{GlobalWorkOffset[0], GlobalWorkOffset[1], GlobalWorkOffset[2]}));
break;
namespace native_cpu {
struct NDRDescT {
using RangeT = std::array<size_t, 3>;
uint32_t WorkDim;
RangeT GlobalOffset;
RangeT GlobalSize;
RangeT LocalSize;
NDRDescT(uint32_t WorkDim, const size_t *GlobalWorkOffset,
const size_t *GlobalWorkSize, const size_t *LocalWorkSize) {
for (uint32_t I = 0; I < WorkDim; I++) {
GlobalOffset[I] = GlobalWorkOffset[I];
GlobalSize[I] = GlobalWorkSize[I];
LocalSize[I] = LocalWorkSize[I];
}
for (uint32_t I = WorkDim; I < 3; I++) {
GlobalSize[I] = 1;
LocalSize[I] = LocalSize[0] ? 1 : 0;
GlobalOffset[I] = 0;
}
}
return Res;
}
};
} // namespace native_cpu

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
Expand All @@ -62,11 +59,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

// TODO: add proper error checking
// TODO: add proper event dep management
sycl::detail::NDRDescT ndr =
getNDRDesc(workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize);
native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize);
hKernel->handleLocalArgs();

__nativecpu_state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
ndr.LocalSize[2], ndr.GlobalOffset[0],
ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
Expand Down Expand Up @@ -124,7 +121,7 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl(
ur_rect_region_t region, size_t BufferRowPitch, size_t BufferSlicePitch,
size_t HostRowPitch, size_t HostSlicePitch,
typename std::conditional<IsRead, void *, const void *>::type DstMem,
pi_uint32, const ur_event_handle_t *, ur_event_handle_t *) {
uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
// TODO: events, blocking, check other constraints, performance optimizations
// More sharing with level_zero where possible

Expand Down
Loading

0 comments on commit 986a7ec

Please sign in to comment.