Skip to content

Commit

Permalink
Merge branch 'sycl' into maxime/bugfix-usm-memset-shortcut
Browse files Browse the repository at this point in the history
  • Loading branch information
mfrancepillois committed Feb 9, 2024
2 parents 19166bb + 1f37b5e commit a3837e5
Show file tree
Hide file tree
Showing 160 changed files with 2,276 additions and 706 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,16 @@
// of the split is new modules containing corresponding callgraph.
//===----------------------------------------------------------------------===//

#pragma once
#ifndef LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
#define LLVM_SYCLLOWERIR_MODULE_SPLITTER_H

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Error.h"

#include <memory>
#include <string>
#include <vector>

namespace llvm {
Expand Down Expand Up @@ -229,8 +231,8 @@ class ModuleSplitterBase {
// For device global variables with the 'device_image_scope' property,
// the function checks that there are no usages of a single device global
// variable from kernels grouped to different modules. Otherwise, an error is
// issued and the tool is aborted.
void verifyNoCrossModuleDeviceGlobalUsage();
// returned.
Error verifyNoCrossModuleDeviceGlobalUsage();

virtual ~ModuleSplitterBase() = default;

Expand Down Expand Up @@ -262,3 +264,5 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,
} // namespace module_split

} // namespace llvm

#endif // LLVM_SYCLLOWERIR_MODULE_SPLITTER_H
1 change: 1 addition & 0 deletions llvm/lib/SYCLLowerIR/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ add_llvm_component_library(LLVMSYCLLowerIR
LowerInvokeSimd.cpp
LowerWGLocalMemory.cpp
LowerWGScope.cpp
ModuleSplitter.cpp
MutatePrintfAddrspace.cpp
SYCLAddOptLevelAttribute.cpp
SYCLPropagateAspectsUsage.cpp
Expand Down
38 changes: 38 additions & 0 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) {
CI.replaceAllUsesWith(LI);
}

static void translateScatterStore(CallInst &CI, bool IsSLM) {
IRBuilder<> Builder(&CI);
constexpr int AlignmentTemplateArgIdx = 2;
APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
ESIMDIntrinDesc::GenXArgConversion::TO_I64);
Align AlignValue(Val.getZExtValue());

auto ValsOp = CI.getArgOperand(0);
auto OffsetsOp = CI.getArgOperand(1);
auto MaskOp = CI.getArgOperand(2);
auto DataType = ValsOp->getType();

// Convert the mask from <N x i16> to <N x i1>.
Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);

// The address space may be 3-SLM, 1-global or private.
// At the moment of calling 'scatter()' operation the pointer passed to it
// is already 4-generic. Thus, simply use 4-generic for global and private
// and let GPU BE deduce the actual address space from the use-def graph.
unsigned AS = IsSLM ? 3 : 4;
auto ElemType = DataType->getScalarType();
auto NumElems = (cast<VectorType>(DataType))->getElementCount();
auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);

auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp);
SI->setDebugLoc(CI.getDebugLoc());
CI.replaceAllUsesWith(SI);
}

// TODO Specify document behavior for slm_init and nbarrier_init when:
// 1) they are called not from kernels
// 2) there are multiple such calls reachable from a kernel
Expand Down Expand Up @@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
continue;
}

if (Name.starts_with("__esimd_scatter_st") ||
Name.starts_with("__esimd_slm_scatter_st")) {
translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st"));
ToErase.push_back(CI);
continue;
}

if (Name.starts_with("__esimd_nbarrier_init")) {
translateNbarrierInit(*CI);
ToErase.push_back(CI);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@
// See comments in the header.
//===----------------------------------------------------------------------===//

#include "ModuleSplitter.h"
#include "Support.h"

#include "llvm/SYCLLowerIR/ModuleSplitter.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringExtras.h"
Expand All @@ -23,6 +21,7 @@
#include "llvm/SYCLLowerIR/DeviceGlobals.h"
#include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
#include "llvm/SYCLLowerIR/SYCLUtils.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
Expand Down Expand Up @@ -426,14 +425,15 @@ class ModuleSplitter : public ModuleSplitterBase {
DependencyGraph CG;
};
} // namespace

namespace llvm {
namespace module_split {

void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
Error ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
const Module &M = getInputModule();
// Early exit if there is only one group
if (Groups.size() < 2)
return;
return Error::success();

// Reverse the EntryPointGroupMap to get a map of entry point -> module's name
unsigned EntryPointNumber = 0;
Expand All @@ -451,19 +451,25 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {

std::optional<StringRef> VarEntryPointModule{};
auto CheckEntryPointModule = [&VarEntryPointModule, &EntryPointModules,
&GV](const auto *F) {
&GV](const auto *F) -> Error {
auto EntryPointModulesIt = EntryPointModules.find(F);
assert(EntryPointModulesIt != EntryPointModules.end() &&
"There is no group for an entry point");
if (EntryPointModulesIt == EntryPointModules.end())
return createStringError(inconvertibleErrorCode(),
"There is no group for an entry point");

if (!VarEntryPointModule.has_value()) {
VarEntryPointModule = EntryPointModulesIt->second;
return;
}
if (EntryPointModulesIt->second != *VarEntryPointModule) {
error("device_global variable '" + Twine(GV.getName()) +
"' with property \"device_image_scope\" is used in more "
"than one device image.");
return Error::success();
}

if (EntryPointModulesIt->second != *VarEntryPointModule)
return createStringError(
inconvertibleErrorCode(),
"device_global variable '" + Twine(GV.getName()) +
"' with property \"device_image_scope\" is used in more "
"than one device image.");

return Error::success();
};

SmallSetVector<const User *, 32> Workqueue;
Expand All @@ -478,13 +484,18 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() {
continue;
}
if (auto *F = dyn_cast<const Function>(U)) {
if (EntryPointModules.count(F))
CheckEntryPointModule(F);
if (EntryPointModules.count(F)) {
auto E = CheckEntryPointModule(F);
if (E)
return E;
}
}
for (auto *UU : U->users())
Workqueue.insert(UU);
}
}

return Error::success();
}

#ifndef NDEBUG
Expand Down
1 change: 0 additions & 1 deletion llvm/tools/sycl-post-link/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ include_directories(

add_llvm_tool(sycl-post-link
sycl-post-link.cpp
ModuleSplitter.cpp
SpecConstants.cpp
SYCLDeviceLibReqMask.cpp
SYCLKernelParamOptInfo.cpp
Expand Down
2 changes: 1 addition & 1 deletion llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
//===----------------------------------------------------------------------===//

#include "SYCLDeviceRequirements.h"
#include "ModuleSplitter.h"

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Module.h"
#include "llvm/SYCLLowerIR/ModuleSplitter.h"
#include "llvm/Support/PropertySetIO.h"

#include <set>
Expand Down
9 changes: 6 additions & 3 deletions llvm/tools/sycl-post-link/sycl-post-link.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
// - specialization constant intrinsic transformation
//===----------------------------------------------------------------------===//

#include "ModuleSplitter.h"
#include "SYCLDeviceLibReqMask.h"
#include "SYCLDeviceRequirements.h"
#include "SYCLKernelParamOptInfo.h"
Expand All @@ -40,6 +39,7 @@
#include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h"
#include "llvm/SYCLLowerIR/HostPipes.h"
#include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
#include "llvm/SYCLLowerIR/ModuleSplitter.h"
#include "llvm/SYCLLowerIR/SYCLUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
Expand Down Expand Up @@ -1009,8 +1009,11 @@ processInputModule(std::unique_ptr<Module> M) {
Modified |= SplitOccurred;

// FIXME: this check is not performed for ESIMD splits
if (DeviceGlobals)
Splitter->verifyNoCrossModuleDeviceGlobalUsage();
if (DeviceGlobals) {
auto E = Splitter->verifyNoCrossModuleDeviceGlobalUsage();
if (E)
error(toString(std::move(E)));
}

// It is important that we *DO NOT* preserve all the splits in memory at the
// same time, because it leads to a huge RAM consumption by the tool on bigger
Expand Down
20 changes: 10 additions & 10 deletions sycl/doc/design/CompilerAndRuntimeDesign.md
Original file line number Diff line number Diff line change
Expand Up @@ -567,8 +567,8 @@ Unlike other AOT targets, the bitcode module linked from intermediate compiled
objects never goes through SPIR-V. Instead it is passed directly in bitcode form
down to the NVPTX Back End. All produced bitcode depends on two libraries,
`libdevice.bc` (provided by the CUDA SDK) and `libspirv-nvptx64--nvidiacl.bc` variants
(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly.
Instead it is used to generate remangled variants
(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly.
Instead it is used to generate remangled variants
`remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` and
`remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` to handle primitive type
differences between Linux and Windows.
Expand Down Expand Up @@ -600,14 +600,14 @@ path in SYCL kernels.

##### NVPTX Builtins

Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long`
Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long`
types as 64 bit and has no `long long` types while Windows DPC++ treats `long`
types like 32-bit integers and `long long` types like 64-bit integers.
Differences between the primitive types can cause applications to use
incompatible libclc built-ins. A remangler creates multiple libspriv files
with different remangled function names to support both Windows and Linux.
When building a SYCL application targeting the CUDA backend the driver
will link the device code with
types like 32-bit integers and `long long` types like 64-bit integers.
Differences between the primitive types can cause applications to use
incompatible libclc built-ins. A remangler creates multiple libspirv files
with different remangled function names to support both Windows and Linux.
When building a SYCL application targeting the CUDA backend the driver
will link the device code with
`remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
Windows or it will link the device code with
`remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is
Expand Down Expand Up @@ -916,7 +916,7 @@ template <typename T, address_space AS> class multi_ptr {
// DecoratedType<T, global_space>::type == "__attribute__((opencl_global)) T"
// See sycl/include/sycl/access/access.hpp for more details
using pointer_t = typename DecoratedType<T, AS>::type *;
pointer_t m_Pointer;
public:
pointer_t get() { return m_Pointer; }
Expand Down
12 changes: 10 additions & 2 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
:encoding: utf-8
:lang: en
:dpcpp: pass:[DPC++]
:endnote: &#8212;{nbsp}end{nbsp}note

// Set the default source code type in this document to C++,
// for syntax highlighting purposes. This is needed because
Expand Down Expand Up @@ -101,8 +102,8 @@ available only in host code as noted below.
The complex type is trivially copyable and type trait `is_device_copyable`
should resolve to `std::true_type`.

The `T` template parameter must be one of the types float, double, or
sycl::half.
_Constraints_: The `T` template parameter must be one of the types `float`,
`double`, or `sycl::half`.

Note: When performing operations between complex numbers and decimals,
the decimal is treated as a complex number with a real component equal to
Expand Down Expand Up @@ -335,6 +336,13 @@ Additionally, this extension introduces support for the `real` and `imag` free
functions, which returns the real and imaginary component of a number,
respectively.

[_Note:_ The overloads of the functions `real(T)` and `imag(T)` match the
behavior in ISO C++ where `T` would be treated as a complex number with a zero
imaginary component. This is subject to the constraint that `T` must be one of
the types `float`, `double`, `sycl::half`, or evaluate to `true` for
`std::is_integral`.
_{endnote}_]

These functions are available in both host and device code, and each math
function should follow the C++ standard for handling `NaN` and `Inf` values.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
== Notice

[%hardbreaks]
Copyright (C) 2022-2023 Intel Corporation. All rights reserved.
Copyright (C) 2022-2024 Intel Corporation. All rights reserved.

Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
Expand Down Expand Up @@ -383,9 +383,11 @@ and work-groups to also provide concurrent forward progress guarantees). In
such a case, an implementation must satisfy the strongest request(s).

Devices may not be able to provide the requested forward progress guarantees
for all launch configurations. The <<launch, launch queries>> defined in a
later section allow developers to identify valid launch configurations for
specific combinations of properties.
for all launch configurations. Developers should use the launch queries defined
by the
link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[sycl_ext_oneapi_launch_queries]
extension to identify valid launch configurations for specific combinations of
properties.

[NOTE]
====
Expand Down
Loading

0 comments on commit a3837e5

Please sign in to comment.