diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h similarity index 97% rename from llvm/tools/sycl-post-link/ModuleSplitter.h rename to llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index aaf6108325765..eb09e7528ff49 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -10,7 +10,8 @@ // of the split is new modules containing corresponding callgraph. //===----------------------------------------------------------------------===// -#pragma once +#ifndef LLVM_SYCLLOWERIR_MODULE_SPLITTER_H +#define LLVM_SYCLLOWERIR_MODULE_SPLITTER_H #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" @@ -18,6 +19,7 @@ #include "llvm/Support/Error.h" #include +#include #include namespace llvm { @@ -229,8 +231,8 @@ class ModuleSplitterBase { // For device global variables with the 'device_image_scope' property, // the function checks that there are no usages of a single device global // variable from kernels grouped to different modules. Otherwise, an error is - // issued and the tool is aborted. - void verifyNoCrossModuleDeviceGlobalUsage(); + // returned. + Error verifyNoCrossModuleDeviceGlobalUsage(); virtual ~ModuleSplitterBase() = default; @@ -262,3 +264,5 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, } // namespace module_split } // namespace llvm + +#endif // LLVM_SYCLLOWERIR_MODULE_SPLITTER_H diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index cfafbe1a66c87..b2afa8150aa4f 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -62,6 +62,7 @@ add_llvm_component_library(LLVMSYCLLowerIR LowerInvokeSimd.cpp LowerWGLocalMemory.cpp LowerWGScope.cpp + ModuleSplitter.cpp MutatePrintfAddrspace.cpp SYCLAddOptLevelAttribute.cpp SYCLPropagateAspectsUsage.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 6ec1102f402ba..15c5e7c9a625e 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) { CI.replaceAllUsesWith(LI); } +static void translateScatterStore(CallInst &CI, bool IsSLM) { + IRBuilder<> Builder(&CI); + constexpr int AlignmentTemplateArgIdx = 2; + APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx, + ESIMDIntrinDesc::GenXArgConversion::TO_I64); + Align AlignValue(Val.getZExtValue()); + + auto ValsOp = CI.getArgOperand(0); + auto OffsetsOp = CI.getArgOperand(1); + auto MaskOp = CI.getArgOperand(2); + auto DataType = ValsOp->getType(); + + // Convert the mask from to . + Value *Zero = ConstantInt::get(MaskOp->getType(), 0); + MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero); + + // The address space may be 3-SLM, 1-global or private. + // At the moment of calling 'scatter()' operation the pointer passed to it + // is already 4-generic. Thus, simply use 4-generic for global and private + // and let GPU BE deduce the actual address space from the use-def graph. + unsigned AS = IsSLM ? 3 : 4; + auto ElemType = DataType->getScalarType(); + auto NumElems = (cast(DataType))->getElementCount(); + auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems); + auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType); + + auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp); + SI->setDebugLoc(CI.getDebugLoc()); + CI.replaceAllUsesWith(SI); +} + // TODO Specify document behavior for slm_init and nbarrier_init when: // 1) they are called not from kernels // 2) there are multiple such calls reachable from a kernel @@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, continue; } + if (Name.starts_with("__esimd_scatter_st") || + Name.starts_with("__esimd_slm_scatter_st")) { + translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st")); + ToErase.push_back(CI); + continue; + } + if (Name.starts_with("__esimd_nbarrier_init")) { translateNbarrierInit(*CI); ToErase.push_back(CI); diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp similarity index 97% rename from llvm/tools/sycl-post-link/ModuleSplitter.cpp rename to llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index 87cbf42da2df2..92ff992141945 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -8,9 +8,7 @@ // See comments in the header. //===----------------------------------------------------------------------===// -#include "ModuleSplitter.h" -#include "Support.h" - +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" @@ -23,6 +21,7 @@ #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" +#include "llvm/Support/Error.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" @@ -426,14 +425,15 @@ class ModuleSplitter : public ModuleSplitterBase { DependencyGraph CG; }; } // namespace + namespace llvm { namespace module_split { -void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { +Error ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { const Module &M = getInputModule(); // Early exit if there is only one group if (Groups.size() < 2) - return; + return Error::success(); // Reverse the EntryPointGroupMap to get a map of entry point -> module's name unsigned EntryPointNumber = 0; @@ -451,19 +451,25 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { std::optional VarEntryPointModule{}; auto CheckEntryPointModule = [&VarEntryPointModule, &EntryPointModules, - &GV](const auto *F) { + &GV](const auto *F) -> Error { auto EntryPointModulesIt = EntryPointModules.find(F); - assert(EntryPointModulesIt != EntryPointModules.end() && - "There is no group for an entry point"); + if (EntryPointModulesIt == EntryPointModules.end()) + return createStringError(inconvertibleErrorCode(), + "There is no group for an entry point"); + if (!VarEntryPointModule.has_value()) { VarEntryPointModule = EntryPointModulesIt->second; - return; - } - if (EntryPointModulesIt->second != *VarEntryPointModule) { - error("device_global variable '" + Twine(GV.getName()) + - "' with property \"device_image_scope\" is used in more " - "than one device image."); + return Error::success(); } + + if (EntryPointModulesIt->second != *VarEntryPointModule) + return createStringError( + inconvertibleErrorCode(), + "device_global variable '" + Twine(GV.getName()) + + "' with property \"device_image_scope\" is used in more " + "than one device image."); + + return Error::success(); }; SmallSetVector Workqueue; @@ -478,13 +484,18 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { continue; } if (auto *F = dyn_cast(U)) { - if (EntryPointModules.count(F)) - CheckEntryPointModule(F); + if (EntryPointModules.count(F)) { + auto E = CheckEntryPointModule(F); + if (E) + return E; + } } for (auto *UU : U->users()) Workqueue.insert(UU); } } + + return Error::success(); } #ifndef NDEBUG diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt index 40bd3f899e487..3905e836aaae8 100644 --- a/llvm/tools/sycl-post-link/CMakeLists.txt +++ b/llvm/tools/sycl-post-link/CMakeLists.txt @@ -24,7 +24,6 @@ include_directories( add_llvm_tool(sycl-post-link sycl-post-link.cpp - ModuleSplitter.cpp SpecConstants.cpp SYCLDeviceLibReqMask.cpp SYCLKernelParamOptInfo.cpp diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 70d3a90f51785..5255ce7bf2a66 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -7,11 +7,11 @@ //===----------------------------------------------------------------------===// #include "SYCLDeviceRequirements.h" -#include "ModuleSplitter.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/Support/PropertySetIO.h" #include diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 85728be8aabf0..068962b63c1fc 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -13,7 +13,6 @@ // - specialization constant intrinsic transformation //===----------------------------------------------------------------------===// -#include "ModuleSplitter.h" #include "SYCLDeviceLibReqMask.h" #include "SYCLDeviceRequirements.h" #include "SYCLKernelParamOptInfo.h" @@ -40,6 +39,7 @@ #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h" #include "llvm/SYCLLowerIR/HostPipes.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -1009,8 +1009,11 @@ processInputModule(std::unique_ptr M) { Modified |= SplitOccurred; // FIXME: this check is not performed for ESIMD splits - if (DeviceGlobals) - Splitter->verifyNoCrossModuleDeviceGlobalUsage(); + if (DeviceGlobals) { + auto E = Splitter->verifyNoCrossModuleDeviceGlobalUsage(); + if (E) + error(toString(std::move(E))); + } // It is important that we *DO NOT* preserve all the splits in memory at the // same time, because it leads to a huge RAM consumption by the tool on bigger diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index c9d9e2b7eba90..ccc0f642f992f 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -567,8 +567,8 @@ Unlike other AOT targets, the bitcode module linked from intermediate compiled objects never goes through SPIR-V. Instead it is passed directly in bitcode form down to the NVPTX Back End. All produced bitcode depends on two libraries, `libdevice.bc` (provided by the CUDA SDK) and `libspirv-nvptx64--nvidiacl.bc` variants -(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. -Instead it is used to generate remangled variants +(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. +Instead it is used to generate remangled variants `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` and `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` to handle primitive type differences between Linux and Windows. @@ -600,14 +600,14 @@ path in SYCL kernels. ##### NVPTX Builtins -Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` +Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` types as 64 bit and has no `long long` types while Windows DPC++ treats `long` -types like 32-bit integers and `long long` types like 64-bit integers. -Differences between the primitive types can cause applications to use -incompatible libclc built-ins. A remangler creates multiple libspriv files -with different remangled function names to support both Windows and Linux. -When building a SYCL application targeting the CUDA backend the driver -will link the device code with +types like 32-bit integers and `long long` types like 64-bit integers. +Differences between the primitive types can cause applications to use +incompatible libclc built-ins. A remangler creates multiple libspirv files +with different remangled function names to support both Windows and Linux. +When building a SYCL application targeting the CUDA backend the driver +will link the device code with `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is Windows or it will link the device code with `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is @@ -916,7 +916,7 @@ template class multi_ptr { // DecoratedType::type == "__attribute__((opencl_global)) T" // See sycl/include/sycl/access/access.hpp for more details using pointer_t = typename DecoratedType::type *; - + pointer_t m_Pointer; public: pointer_t get() { return m_Pointer; } diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc index 41700a898fecb..e4eb444f1ed60 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc @@ -10,6 +10,7 @@ :encoding: utf-8 :lang: en :dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note // Set the default source code type in this document to C++, // for syntax highlighting purposes. This is needed because @@ -101,8 +102,8 @@ available only in host code as noted below. The complex type is trivially copyable and type trait `is_device_copyable` should resolve to `std::true_type`. -The `T` template parameter must be one of the types float, double, or -sycl::half. +_Constraints_: The `T` template parameter must be one of the types `float`, +`double`, or `sycl::half`. Note: When performing operations between complex numbers and decimals, the decimal is treated as a complex number with a real component equal to @@ -335,6 +336,13 @@ Additionally, this extension introduces support for the `real` and `imag` free functions, which returns the real and imaginary component of a number, respectively. +[_Note:_ The overloads of the functions `real(T)` and `imag(T)` match the +behavior in ISO C++ where `T` would be treated as a complex number with a zero +imaginary component. This is subject to the constraint that `T` must be one of +the types `float`, `double`, `sycl::half`, or evaluate to `true` for +`std::is_integral`. +_{endnote}_] + These functions are available in both host and device code, and each math function should follow the C++ standard for handling `NaN` and `Inf` values. diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc index 4c1df1640d021..227b23f1ffdac 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc @@ -20,7 +20,7 @@ == Notice [%hardbreaks] -Copyright (C) 2022-2023 Intel Corporation. All rights reserved. +Copyright (C) 2022-2024 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -383,9 +383,11 @@ and work-groups to also provide concurrent forward progress guarantees). In such a case, an implementation must satisfy the strongest request(s). Devices may not be able to provide the requested forward progress guarantees -for all launch configurations. The <> defined in a -later section allow developers to identify valid launch configurations for -specific combinations of properties. +for all launch configurations. Developers should use the launch queries defined +by the +link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[sycl_ext_oneapi_launch_queries] +extension to identify valid launch configurations for specific combinations of +properties. [NOTE] ==== diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc index 0ef2126d86fa1..44eb1b376d113 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc @@ -182,7 +182,8 @@ inline constexpr use_root_sync_key::value_t use_root_sync; === The `root_group` class The `root_group` class implements all member functions common to the -`sycl::group` and `sycl::sub_group` classes. +`sycl::group` and `sycl::sub_group` classes and also contains own +additional functions. [source,c++] ---- @@ -191,6 +192,13 @@ namespace ext { namespace oneapi { namespace experimental { +enum class execution_scope { + work_item, + sub_group, + work_group, + root_group, +}; + template class root_group { public: @@ -221,6 +229,31 @@ public: bool leader() const; + template + std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + id> + get_id() const; + + template + std::enable_if_t> get_id() const; + + template + size_t get_linear_id() const; + + template + std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + range> + get_range() const; + + template + std::enable_if_t> + get_range() const; + + template + size_t get_linear_range() const; + }; } // namespace experimental @@ -307,6 +340,67 @@ work-item is the leader of the root-group, and `false` for all other work-items in the root-group. The leader of the root-group is guaranteed to be the work-item for which `get_local_id()` returns 0. +[source,c++] +---- +template +std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + id> +get_id() const; +---- +_Returns_: An `id` representing the index of the current work-group or work-item at `Scope` +hierarchy level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t> get_id() const; +---- +_Returns_: An `id` representing the index of the current sub-group within the +`root_group` object. + +[source,c++] +---- +template +size_t get_linear_id() const; +---- +_Constraints_: `Scope` must be narrower than +`execution_scope::root_group`. + +_Returns_: A linearized number of the current work-group or work-item at `Scope` hierarchy +level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + range> +get_range() const; +---- +_Returns_: A `range` representing the number of work-groups or work-items of `Scope` +hierarchy level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t> +get_range() const; +---- +_Returns_: A `range` representing the number of sub-groups within the `root_group` +object. + +[source,c++] +---- +template +size_t get_linear_range() const; +---- +_Constraints_: `Scope` must be narrower than +`execution_scope::root_group`. + +_Returns_: The number of work-groups or work-items of `Scope` hierarchy level within the +`root_group` object. + === Using a `root_group` diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index ea1a6580d30e6..9af5b7e75ae38 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -174,10 +174,11 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, Ts val, size_t i); -template -extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixPrefetchINTEL( - T *Ptr, std::size_t coordX, std::size_t coordY, unsigned int CacheLevel, - __spv::MatrixLayout Layout, std::size_t Stride); +template +extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL( + T *Ptr, std::size_t coordX, std::size_t coordY, std::size_t NumRows, + std::size_t NumCols, unsigned int CacheLevel, __spv::MatrixLayout Layout, + std::size_t Stride); #ifndef __SPIRV_BUILTIN_DECLARATIONS__ #error \ diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp index bc497c540693b..91fd50d2ec4f2 100644 --- a/sycl/include/sycl/builtins_preview.hpp +++ b/sycl/include/sycl/builtins_preview.hpp @@ -137,7 +137,14 @@ auto builtin_marray_impl(FuncTy F, const Ts &...x) { marray Res; constexpr auto N = T::size(); for (size_t I = 0; I < N / 2; ++I) { - auto PartialRes = F(to_vec2(x, I * 2)...); + auto PartialRes = [&]() { + using elem_ty = get_elem_type_t; + if constexpr (std::is_integral_v) + return F(to_vec2(x, I * 2) + .template as, 2>>()...); + else + return F(to_vec2(x, I * 2)...); + }(); std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes))); } if (N % 2) @@ -175,12 +182,6 @@ auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) { } } -template -struct any_elem_type - : std::bool_constant, float, double, half, char, signed char, short, - int, long, long long, unsigned char, unsigned short, unsigned int, - unsigned long, unsigned long long>> {}; template struct fp_elem_type : std::bool_constant< @@ -188,16 +189,6 @@ struct fp_elem_type template struct float_elem_type : std::bool_constant, float>> {}; -template -struct integer_elem_type - : std::bool_constant< - check_type_in_v, char, signed char, short, int, - long, long long, unsigned char, unsigned short, - unsigned int, unsigned long, unsigned long long>> {}; -template -struct suint32_elem_type - : std::bool_constant< - check_type_in_v, int32_t, uint32_t>> {}; template struct same_basic_shape : std::bool_constant> {}; @@ -244,13 +235,6 @@ struct builtin_enable SHAPE_CHECKER, EXTRA_CONDITIONS, Ts...>::type; \ } } // namespace detail - -BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type, - any_shape, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type, - any_elem_type, scalar_only, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type, - any_elem_type, non_scalar_only, same_elem_type) } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/builtins_utils_scalar.hpp b/sycl/include/sycl/builtins_utils_scalar.hpp index 62d86df045cdf..31f267892243b 100644 --- a/sycl/include/sycl/builtins_utils_scalar.hpp +++ b/sycl/include/sycl/builtins_utils_scalar.hpp @@ -128,6 +128,17 @@ template struct get_unsigned_int_by_size { template struct same_size_unsigned_int { using type = typename get_unsigned_int_by_size::type; }; +template +using same_size_unsigned_int_t = typename same_size_unsigned_int::type; + +template struct get_fixed_sized_int { + static_assert(std::is_integral_v); + using type = + std::conditional_t, same_size_signed_int_t, + same_size_unsigned_int_t>; +}; +template +using get_fixed_sized_int_t = typename get_fixed_sized_int::type; // Utility trait for getting an upsampled integer type. // NOTE: For upsampling we look for an integer of double the size of the diff --git a/sycl/include/sycl/detail/builtins/common_functions.inc b/sycl/include/sycl/detail/builtins/common_functions.inc index fb10964934cf0..022cab78e51db 100644 --- a/sycl/include/sycl/detail/builtins/common_functions.inc +++ b/sycl/include/sycl/detail/builtins/common_functions.inc @@ -72,32 +72,15 @@ min(T x, detail::get_elem_type_t y) { detail::simplify_if_swizzle_t{y}); } -#undef BUILTIN_COMMON - -#ifdef __SYCL_DEVICE_ONLY__ -DEVICE_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, - [](auto... xs) { - using ElemTy = detail::get_elem_type_t; - if constexpr (std::is_integral_v) { - if constexpr (std::is_signed_v) { - return __spirv_ocl_s_clamp(xs...); - } else { - return __spirv_ocl_u_clamp(xs...); - } - } else { - return __spirv_ocl_fclamp(xs...); - } - }) -#else -HOST_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, common, - default_ret_type) -#endif +BUILTIN_COMMON(THREE_ARGS, clamp, __spirv_ocl_fclamp) template -detail::builtin_enable_generic_non_scalar_t +detail::builtin_enable_common_non_scalar_t clamp(T x, detail::get_elem_type_t y, detail::get_elem_type_t z) { return clamp(detail::simplify_if_swizzle_t{x}, detail::simplify_if_swizzle_t{y}, detail::simplify_if_swizzle_t{z}); } + +#undef BUILTIN_COMMON } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/builtins/helper_macros.hpp b/sycl/include/sycl/detail/builtins/helper_macros.hpp index 49d4af8981d70..38014c4b62c8f 100644 --- a/sycl/include/sycl/detail/builtins/helper_macros.hpp +++ b/sycl/include/sycl/detail/builtins/helper_macros.hpp @@ -48,6 +48,11 @@ FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ ARG4, ARG5, ARG6) \ BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG7) +#define FOR_EACH4_A8(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, \ + ARG3, ARG4, ARG5, ARG6, ARG7, ARG8) \ + FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ + ARG4, ARG5, ARG6, ARG7) \ + BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG8) #define FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, \ ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11) \ FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ @@ -169,6 +174,9 @@ unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long // 11 types #define INTEGER_TYPES SIGNED_TYPES, UNSIGNED_TYPES +// 8 types +#define FIXED_WIDTH_INTEGER_TYPES \ + int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t #define DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE( \ NUM_ARGS, NAME, ENABLER, DELEGATOR, NS, /*SCALAR_VEC_IMPL*/...) \ diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc index dfeb815e52494..44699765ff7fd 100644 --- a/sycl/include/sycl/detail/builtins/integer_functions.inc +++ b/sycl/include/sycl/detail/builtins/integer_functions.inc @@ -12,6 +12,19 @@ namespace sycl { inline namespace _V1 { +namespace detail { +template +struct integer_elem_type + : std::bool_constant< + (is_vec_or_swizzle_v && + check_type_in_v, FIXED_WIDTH_INTEGER_TYPES>) || + (!is_vec_or_swizzle_v && + check_type_in_v, INTEGER_TYPES>)> {}; +template +struct suint32_elem_type + : std::bool_constant< + check_type_in_v, int32_t, uint32_t>> {}; +} // namespace detail BUILTIN_CREATE_ENABLER(builtin_enable_integer, default_ret_type, integer_elem_type, any_shape, same_elem_type) BUILTIN_CREATE_ENABLER(builtin_enable_integer_non_scalar, default_ret_type, @@ -122,6 +135,15 @@ min(T x, detail::get_elem_type_t y) { detail::simplify_if_swizzle_t{y}); } +BUILTIN_GENINT_SU(THREE_ARGS, clamp) +template +detail::builtin_enable_integer_non_scalar_t +clamp(T x, detail::get_elem_type_t y, detail::get_elem_type_t z) { + return clamp(detail::simplify_if_swizzle_t{x}, + detail::simplify_if_swizzle_t{y}, + detail::simplify_if_swizzle_t{z}); +} + BUILTIN_GENINT(ONE_ARG, clz) BUILTIN_GENINT(ONE_ARG, ctz) BUILTIN_GENINT(ONE_ARG, popcount) diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index d23f144676cdb..d63a7716aa9b7 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -13,6 +13,15 @@ namespace sycl { inline namespace _V1 { namespace detail { +template +struct bitselect_elem_type + : std::bool_constant< + check_type_in_v, FP_TYPES> || + (is_vec_or_swizzle_v && + check_type_in_v, FIXED_WIDTH_INTEGER_TYPES>) || + (!is_vec_or_swizzle_v && + check_type_in_v, INTEGER_TYPES>)> {}; + template struct rel_ret_traits : std::conditional, bool, @@ -21,6 +30,9 @@ struct rel_ret_traits same_size_signed_int_t>>> { }; } // namespace detail + +BUILTIN_CREATE_ENABLER(builtin_enable_bitselect, default_ret_type, + bitselect_elem_type, any_shape, same_elem_type) BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type, non_scalar_only, same_elem_type) @@ -119,13 +131,14 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet) #ifdef __SYCL_DEVICE_ONLY__ DEVICE_IMPL_TEMPLATE( - THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) { - using ret_ty = detail::builtin_enable_generic_t; + THREE_ARGS, bitselect, builtin_enable_bitselect_t, [](auto... xs) { + using ret_ty = + detail::builtin_enable_bitselect_t; using detail::builtins::convert_result; return convert_result(__spirv_ocl_bitselect(xs...)); }) #else -HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel, +HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_bitselect_t, rel, default_ret_type) #endif diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index d723d03353cb2..c9d0d6cc4d410 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t __esimd_slm_gather_ld( __ESIMD_DNS::simd_mask_storage_t pred, __ESIMD_DNS::vector_type_t pass_thru) __ESIMD_INTRIN_END; +// Scatter data to given global or private addresses. +template +__ESIMD_INTRIN void +__esimd_scatter_st(__ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + +// Scatter data to given SLM addresses. +template +__ESIMD_INTRIN void __esimd_slm_scatter_st( + __ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + /// Surface-based gather. /// Supported platforms: DG2, PVC /// diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index d923821027339..4ffd5c418138c 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -724,11 +724,20 @@ scatter(T *p, simd byte_offsets, simd vals, // Use LSC lowering if L1/L2 or VS > 1. if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + VS > 1 || + (!__ESIMD_DNS::isPowerOf2(N, 32) && + !detail::isMaskedGatherScatterLLVMAvailable())) { static_assert(VS == 1 || sizeof(T) >= 4, "VS > 1 is supprted only for 4- and 8-byte elements"); return detail::scatter_impl(p, byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + simd Addrs(reinterpret_cast(p)); + Addrs = Addrs + convert(byte_offsets); + using MsgT = detail::__raw_t; + __esimd_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + Addrs.data(), mask.data()); } else { using Tx = detail::__raw_t; simd byte_offsets_i = convert(byte_offsets); @@ -2685,11 +2694,13 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, namespace detail { template ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t< - (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) && - (std::is_same_v || - is_accessor_with_v)> + std::is_same_v || + is_accessor_with_v> scatter_impl(AccessorTy acc, simd vals, simd offsets, uint32_t glob_offset, simd_mask mask) { + + static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32), + "Unexpected type or vector length"); constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); // TODO (performance) use hardware-supported scale once BE supports it constexpr int16_t scale = 0; @@ -2820,10 +2831,9 @@ gather_impl(AccessorT acc, simd byte_offsets, /// @return is a vector of type T and size N * NElts. /// template -__ESIMD_API __ESIMD_NS::simd -slm_gather_impl(__ESIMD_NS::simd offsets, - __ESIMD_NS::simd_mask pred, - __ESIMD_NS::simd pass_thru) { +__ESIMD_API simd slm_gather_impl(simd offsets, + simd_mask pred, + simd pass_thru) { check_lsc_vector_size(); check_lsc_data_size(); constexpr uint16_t AddressScale = 1; @@ -2832,9 +2842,8 @@ slm_gather_impl(__ESIMD_NS::simd offsets, constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; - __ESIMD_NS::simd PassThruExpanded = - lsc_format_input(pass_thru); - __ESIMD_NS::simd Result = + simd PassThruExpanded = lsc_format_input(pass_thru); + simd Result = __esimd_lsc_load_merge_slm(pred.data(), offsets.data(), @@ -2842,6 +2851,37 @@ slm_gather_impl(__ESIMD_NS::simd offsets, return lsc_format_ret(Result); } +/// SLM scatter implementation. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.slm +/// +/// Scatters elements located to slm. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam N is the number of channels (platform dependent). +/// @param offsets is the zero-based offsets for SLM buffer in bytes. +/// @param vals is values to store. +/// @param pred is predicates. +/// +template +__ESIMD_API void slm_scatter_impl(simd offsets, + simd vals, simd_mask pred) { + check_lsc_vector_size(); + check_lsc_data_size(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd Tmp = lsc_format_input(vals); + __esimd_lsc_store_slm( + pred.data(), offsets.data(), Tmp.data()); +} + } // namespace detail /// @endcond ESIMD_DETAIL @@ -3903,7 +3943,7 @@ slm_gather(simd byte_offsets, simd_mask mask, static_assert(Alignment >= sizeof(T), "slm_gather() requires at least element-size alignment"); - if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) && + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && !detail::isMaskedGatherScatterLLVMAvailable())) { simd PassThru; // Intentionally undefined return detail::slm_gather_impl( @@ -4118,7 +4158,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// @param props The optional compile-time properties. Only 'alignment' -/// and cache hint properties are used. +/// property is used. /// @return A vector of elements read. template __ESIMD_API T slm_scalar_load(uint32_t offset) { return Res[0]; } -/// Scatter operation over the Shared Local Memory. -/// This API has almost the same interface as the @ref accessor_scatter -/// "accessor-based scatter", except that it does not have the accessor and -/// the global offset parameters. -/// -template -__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) && - (sizeof(T) <= 4)> -slm_scatter(simd offsets, simd vals, simd_mask mask = 1) { - detail::LocalAccessorMarker acc; - detail::scatter_impl(acc, vals, offsets, 0, mask); +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// void slm_scatter(simd byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-2) +/// +/// The next 2 functions are variations of the first 2 above (slm-sc-1,2) +/// and were added only to support simd_view instead of simd for byte_offsets. +/// template +/// void slm_scatter(OffsetSimdViewT byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-3) +/// void slm_scatter(OffsetSimdViewT byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-4) + +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. Storage of any element +/// can be disabled via the input vector of predicates \p mask. +/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' property +/// is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "slm_scatter() requires at least element-size alignment"); + + // Use LSC lowering if VS > 1. + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && + !detail::isMaskedGatherScatterLLVMAvailable())) { + __ESIMD_DNS::slm_scatter_impl( + byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + using MsgT = detail::__raw_t; + __esimd_slm_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + byte_offsets.data(), mask.data()); + } else { + detail::LocalAccessorMarker acc; + detail::scatter_impl(acc, vals, byte_offsets, 0, mask); + } +} + +/// template +/// void slm_scatter(simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-2) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + slm_scatter(byte_offsets, vals, Mask, props); +} + +/// template +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (slm-sc-3) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// Storage to any element's memory location can be disabled via the +/// input vector of predicates \p mask. If mask[i] is unset, then the storage to +/// (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + slm_scatter(byte_offsets.read(), vals, mask, props); +} + +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-4) +/// Loads ("gathers") elements of the type 'T' from Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets, and returns the loaded +/// elements. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + return slm_scatter(byte_offsets.read(), vals, props); } /// Store a scalar value into the Shared Local Memory. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 7fb0ae833e793..9d5054b4392ba 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1424,21 +1424,7 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using CstT = __ESIMD_DNS::uint_type_t; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); - __esimd_lsc_store_slm( - pred.data(), offsets.data(), Tmp.data()); + __ESIMD_DNS::slm_scatter_impl(offsets, vals, pred); } /// Transposed SLM scatter with 1 channel. diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 98aea6f04a48b..a07e9c144ba6a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -524,8 +524,9 @@ joint_matrix_prefetch(Group sg, T *Ptr, size_t stride, // Will be removed once SPIRV implementation also uses offsetpointer size_t coordX = 0; size_t coordY = 0; - __spirv_JointMatrixPrefetchINTEL( - Ptr, coordX, coordY, detail::PropertyMetaInfo::value, + __spirv_CooperativeMatrixPrefetchINTEL( + Ptr, coordX, coordY, NumRows, NumCols, + detail::PropertyMetaInfo::value, sycl::detail::joint_matrix_layout_to_spv(Layout), stride); #endif // defined(__NVPTX__) #else diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index a2f934a147c33..c12b9e6781eb9 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -987,6 +987,41 @@ template class vec { #endif #ifdef __SYCL_USE_EXT_VECTOR_TYPE__ +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ + friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ + vec Ret; \ + if constexpr (IsUsingArrayOnDevice) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I))); \ + } \ + } else { \ + Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ + if constexpr (std::is_same_v && CONVERT) { \ + Ret.ConvertToDataT(); \ + } \ + } \ + return Ret; \ + } \ + friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ + return Lhs BINOP vec(Rhs); \ + } \ + friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ + return vec(Lhs) BINOP Rhs; \ + } \ + friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ + } \ + template \ + friend typename std::enable_if_t operator OPASSIGN( \ + vec & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec(Rhs); \ + return Lhs; \ + } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ template \ vec operator BINOP(const EnableIfNotUsingArrayOnDevice &Rhs) const { \ @@ -1024,38 +1059,37 @@ template class vec { *this = *this BINOP vec(Rhs); \ return *this; \ } +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) + #else // __SYCL_USE_EXT_VECTOR_TYPE__ #if defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ - vec operator BINOP(const vec &Rhs) const { \ + friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ vec Ret{}; \ if constexpr (NativeVec) \ - Ret.m_Data = m_Data BINOP Rhs.m_Data; \ + Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ else \ for (size_t I = 0; I < NumElements; ++I) \ - Ret.setValue(I, (DataT)(vec_data::get(getValue( \ + Ret.setValue(I, (DataT)(vec_data::get(Lhs.getValue( \ I)) BINOP vec_data::get(Rhs.getValue(I)))); \ return Ret; \ } \ - template \ - typename std::enable_if_t< \ - std::is_convertible_v && \ - (std::is_fundamental_v> || \ - detail::is_half_or_bf16_v>), \ - vec> \ - operator BINOP(const T & Rhs) const { \ - return *this BINOP vec(static_cast(Rhs)); \ + friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ + return Lhs BINOP vec(Rhs); \ } \ - vec &operator OPASSIGN(const vec & Rhs) { \ - *this = *this BINOP Rhs; \ - return *this; \ + friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ + return vec(Lhs) BINOP Rhs; \ + } \ + friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ } \ template \ - typename std::enable_if_t operator OPASSIGN( \ - const DataT & Rhs) { \ - *this = *this BINOP vec(Rhs); \ - return *this; \ + friend typename std::enable_if_t operator OPASSIGN( \ + vec & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec(Rhs); \ + return Lhs; \ } #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) @@ -1120,6 +1154,42 @@ template class vec { // Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined // by SYCL device compiler only. #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const vec & Rhs) { \ + vec Ret{}; \ + /* This special case is needed since there are no standard operator|| */ \ + /* or operator&& functions for std::array. */ \ + if constexpr (IsUsingArrayOnDevice && \ + (std::string_view(#RELLOGOP) == "||" || \ + std::string_view(#RELLOGOP) == "&&")) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + } \ + } else { \ + Ret = vec( \ + (typename vec::vector_t)( \ + Lhs.m_Data RELLOGOP Rhs.m_Data)); \ + if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ + Ret *= -1; \ + } \ + return Ret; \ + } \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const DataT & Rhs) { \ + return Lhs RELLOGOP vec(Rhs); \ + } \ + friend vec operator RELLOGOP(const DataT & Lhs, \ + const vec & Rhs) { \ + return vec(Lhs) RELLOGOP Rhs; \ + } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_RELLOGOP(RELLOGOP) \ vec operator RELLOGOP(const vec & Rhs) const { \ vec Ret{}; \ @@ -1129,9 +1199,10 @@ template class vec { (std::string_view(#RELLOGOP) == "||" || \ std::string_view(#RELLOGOP) == "&&")) { \ for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, \ - -(vec_data::get(getValue(I)) \ - RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ } \ } else { \ Ret = vec( \ @@ -1150,13 +1221,38 @@ template class vec { operator RELLOGOP(const T & Rhs) const { \ return *this RELLOGOP vec(static_cast(Rhs)); \ } +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #else +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const vec & Rhs) { \ + vec Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + } \ + return Ret; \ + } \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const DataT & Rhs) { \ + return Lhs RELLOGOP vec(Rhs); \ + } \ + friend vec operator RELLOGOP(const DataT & Lhs, \ + const vec & Rhs) { \ + return vec(Lhs) RELLOGOP Rhs; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_RELLOGOP(RELLOGOP) \ vec operator RELLOGOP(const vec & Rhs) const { \ vec Ret{}; \ for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, -(vec_data::get(getValue(I)) \ - RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ } \ return Ret; \ } \ @@ -1168,6 +1264,7 @@ template class vec { operator RELLOGOP(const T & Rhs) const { \ return *this RELLOGOP vec(static_cast(Rhs)); \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #endif __SYCL_RELLOGOP(==) @@ -1184,6 +1281,18 @@ template class vec { #ifdef __SYCL_UOP #error "Undefine __SYCL_UOP macro" #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend vec &operator UOP(vec & Rhs) { \ + Rhs OPASSIGN vec_data::get(1); \ + return Rhs; \ + } \ + friend vec operator UOP(vec &Lhs, int) { \ + vec Ret(Lhs); \ + Lhs OPASSIGN vec_data::get(1); \ + return Ret; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_UOP(UOP, OPASSIGN) \ vec &operator UOP() { \ *this OPASSIGN vec_data::get(1); \ @@ -1194,6 +1303,7 @@ template class vec { *this OPASSIGN vec_data::get(1); \ return Ret; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_UOP(++, +=) __SYCL_UOP(--, -=) @@ -1203,150 +1313,84 @@ template class vec { // operator~() available only when: dataT != float && dataT != double // && dataT != half - template - typename std::enable_if_t> && - (!IsUsingArrayOnDevice && !IsUsingArrayOnHost), - vec> - operator~() const { - vec Ret{(typename vec::DataType) ~m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - template - typename std::enable_if_t> && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec> - operator~() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, ~getValue(I)); + friend vec operator~(const vec &Rhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, ~Rhs.getValue(I)); + } + return Ret; + } else { + vec Ret{(typename vec::DataType) ~Rhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; } - return Ret; } - template -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - using OpNotRet = detail::rel_t; -#else - using OpNotRet = T; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - // operator! - template - EnableIfNotUsingArray, N>> operator!() const { - return vec{(typename vec::DataType) !m_Data} -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - .template as, N>>(); -#else - ; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } - - // std::byte neither supports ! unary op or casting, so special handling is - // needed. And, worse, Windows has a conflict with 'byte'. + friend vec, NumElements> operator!(const vec &Rhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) { #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - template - typename std::enable_if_t && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec, N>> - operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, std::byte{!vec_data::get(getValue(I))}); + // std::byte neither supports ! unary op or casting, so special handling + // is needed. And, worse, Windows has a conflict with 'byte'. + if constexpr (std::is_same_v) { + Ret.setValue(I, std::byte{!vec_data::get(Rhs.getValue(I))}); + } else +#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + { + Ret.setValue(I, !vec_data::get(Rhs.getValue(I))); + } + } + return Ret.template as, NumElements>>(); + } else { + return vec{(typename vec::DataType) !Rhs.m_Data} + .template as, NumElements>>(); } -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES } - template - typename std::enable_if_t && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec, N>> - operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, !vec_data::get(getValue(I))); -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } -#else - template - EnableIfUsingArray, N>> operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, !vec_data::get(getValue(I))); -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } -#endif - // operator + - template EnableIfNotUsingArray operator+() const { - return vec{+m_Data}; - } - - template EnableIfUsingArray operator+() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); - return Ret; - } - - // operator - - template EnableIfNotUsingArray operator-() const { - namespace oneapi = sycl::ext::oneapi; - if constexpr (IsBfloat16 && NumElements == 1) { - vec Ret{}; - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data); - oneapi::bfloat16 w = -v; - Ret.m_Data = oneapi::detail::bfloat16ToBits(w); - } else if constexpr (IsBfloat16) { + friend vec operator+(const vec &Lhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]); - oneapi::bfloat16 w = -v; - Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); - } + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue( + I, vec_data::get(+vec_data::get(Lhs.getValue(I)))); return Ret; } else { - vec Ret{-m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; + return vec{+Lhs.m_Data}; } } - template EnableIfUsingArray operator-() const { + // operator - + friend vec operator-(const vec &Lhs) { namespace oneapi = sycl::ext::oneapi; vec Ret{}; if constexpr (IsBfloat16 && NumElements == 1) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data); + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); oneapi::bfloat16 w = -v; Ret.m_Data = oneapi::detail::bfloat16ToBits(w); } else if constexpr (IsBfloat16) { for (size_t I = 0; I < NumElements; I++) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]); + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); oneapi::bfloat16 w = -v; Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); } - } else { + } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, - vec_data::get(-vec_data::get(getValue(I)))); + Ret.setValue( + I, vec_data::get(-vec_data::get(Lhs.getValue(I)))); + return Ret; + } else { + Ret = vec{-Lhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; } - return Ret; } #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) @@ -1770,6 +1814,19 @@ class SwizzleOp { #ifdef __SYCL_OPASSIGN #error "Undefine __SYCL_OPASSIGN macro." #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_OPASSIGN(OPASSIGN, OP) \ + friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, const DataT & Rhs) { \ + Lhs.operatorHelper(vec_t(Rhs)); \ + return Lhs; \ + } \ + template \ + friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, \ + const RhsOperation & Rhs) { \ + Lhs.operatorHelper(Rhs); \ + return Lhs; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_OPASSIGN(OPASSIGN, OP) \ SwizzleOp &operator OPASSIGN(const DataT & Rhs) { \ operatorHelper(vec_t(Rhs)); \ @@ -1780,6 +1837,7 @@ class SwizzleOp { operatorHelper(Rhs); \ return *this; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_OPASSIGN(+=, std::plus) __SYCL_OPASSIGN(-=, std::minus) @@ -1796,6 +1854,18 @@ class SwizzleOp { #ifdef __SYCL_UOP #error "Undefine __SYCL_UOP macro" #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend SwizzleOp &operator UOP(SwizzleOp & Rhs) { \ + Rhs OPASSIGN static_cast(1); \ + return Rhs; \ + } \ + friend vec_t operator UOP(SwizzleOp &Lhs, int) { \ + vec_t Ret = Lhs; \ + Lhs OPASSIGN static_cast(1); \ + return Ret; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_UOP(UOP, OPASSIGN) \ SwizzleOp &operator UOP() { \ *this OPASSIGN static_cast(1); \ @@ -1806,11 +1876,36 @@ class SwizzleOp { *this OPASSIGN static_cast(1); \ return Ret; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_UOP(++, +=) __SYCL_UOP(--, -=) #undef __SYCL_UOP +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) + template + friend typename std::enable_if_t< + std::is_same_v && std::is_integral_v>, vec_t> + operator~(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return ~Tmp; + } + + friend vec_rel_t operator!(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return !Tmp; + } + + friend vec_t operator+(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return +Tmp; + } + + friend vec_t operator-(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return -Tmp; + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) template typename std::enable_if_t>, vec_t> operator~() { @@ -1832,6 +1927,80 @@ class SwizzleOp { vec_t Tmp = *this; return -Tmp; } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +// scalar BINOP vec<> +// scalar BINOP SwizzleOp +// vec<> BINOP SwizzleOp +#ifdef __SYCL_BINOP +#error "Undefine __SYCL_BINOP macro" +#endif +#define __SYCL_BINOP(BINOP) \ + friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs BINOP Tmp; \ + } \ + friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp BINOP Rhs; \ + } \ + friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs BINOP Tmp; \ + } \ + friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp BINOP Rhs; \ + } + + __SYCL_BINOP(+) + __SYCL_BINOP(-) + __SYCL_BINOP(*) + __SYCL_BINOP(/) + __SYCL_BINOP(%) + __SYCL_BINOP(&) + __SYCL_BINOP(|) + __SYCL_BINOP(^) + __SYCL_BINOP(>>) + __SYCL_BINOP(<<) +#undef __SYCL_BINOP + +// scalar RELLOGOP vec<> +// scalar RELLOGOP SwizzleOp +// vec<> RELLOGOP SwizzleOp +#ifdef __SYCL_RELLOGOP +#error "Undefine __SYCL_RELLOGOP macro" +#endif +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs RELLOGOP Tmp; \ + } \ + friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp RELLOGOP Rhs; \ + } \ + friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs RELLOGOP Tmp; \ + } \ + friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp RELLOGOP Rhs; \ + } + + __SYCL_RELLOGOP(==) + __SYCL_RELLOGOP(!=) + __SYCL_RELLOGOP(>) + __SYCL_RELLOGOP(<) + __SYCL_RELLOGOP(>=) + __SYCL_RELLOGOP(<=) + // TODO: limit to integral types. + __SYCL_RELLOGOP(&&) + __SYCL_RELLOGOP(||) +#undef __SYCL_RELLOGOP +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) template > @@ -2265,6 +2434,7 @@ class SwizzleOp { }; } // namespace detail +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) // scalar BINOP vec<> // scalar BINOP SwizzleOp // vec<> BINOP SwizzleOp @@ -2374,6 +2544,7 @@ __SYCL_RELLOGOP(<=) __SYCL_RELLOGOP(&&) __SYCL_RELLOGOP(||) #undef __SYCL_RELLOGOP +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) namespace detail { diff --git a/sycl/include/sycl/usm.hpp b/sycl/include/sycl/usm.hpp index f9ab634edc81a..ff941e865dff8 100644 --- a/sycl/include/sycl/usm.hpp +++ b/sycl/include/sycl/usm.hpp @@ -154,6 +154,13 @@ __SYCL_EXPORT void *aligned_alloc( const property_list &propList, const detail::code_location &CodeLoc = detail::code_location::current()); +/// +// Helper function used to determine if the Alignment argument is a power of 2 +/// +inline size_t is_not_power_of_two(size_t Alignment) { + return (Alignment & (Alignment - 1)); +} + /// // Template forms /// @@ -179,6 +186,9 @@ T *aligned_alloc_device( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_device(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -189,6 +199,9 @@ T *aligned_alloc_device( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_device(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -230,6 +243,9 @@ T *aligned_alloc_host( size_t Alignment, size_t Count, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_host(std ::max(Alignment, alignof(T)), Count * sizeof(T), Ctxt, PropList, CodeLoc)); @@ -240,6 +256,9 @@ T *aligned_alloc_host( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_host(Alignment, Count, Q.get_context(), PropList, CodeLoc); } @@ -249,6 +268,9 @@ T *aligned_alloc_shared( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_shared(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -259,6 +281,9 @@ T *aligned_alloc_shared( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_shared(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -286,6 +311,9 @@ T *aligned_alloc( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, usm::alloc Kind, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, Kind, PropList, CodeLoc)); @@ -296,6 +324,9 @@ T *aligned_alloc( size_t Alignment, size_t Count, const queue &Q, usm::alloc Kind, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc(Alignment, Count, Q.get_device(), Q.get_context(), Kind, PropList, CodeLoc); } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 6a91815bc890f..2ea2b2f37622a 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 1cd402ead34a54459a6acb85777fbec105a178a0 - # Merge: 5b89ee8b c8e150c5 + # commit 9babc4d092a92c1036791d26ef328e5eeaf19803 + # Merge: 3be8f205 90498ec5 # Author: aarongreig - # Date: Tue Feb 6 14:48:55 2024 +0000 - # Merge pull request #1218 from Bensuo/maxime/imm-cmd-list-support - # [EXP][CMDBUF] L0 Immediate command-list support - set(UNIFIED_RUNTIME_TAG 1cd402ead34a54459a6acb85777fbec105a178a0) + # Date: Thu Feb 8 15:44:54 2024 +0000 + # Merge pull request #1321 from pbalcer/adapter-compute-constructor + # [L0] move adapter init into its constructor from urAdapterGet + set(UNIFIED_RUNTIME_TAG 9babc4d092a92c1036791d26ef328e5eeaf19803) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/source/builtins/common_functions.cpp b/sycl/source/builtins/common_functions.cpp index 09742649ca24d..30d0645f2b517 100644 --- a/sycl/source/builtins/common_functions.cpp +++ b/sycl/source/builtins/common_functions.cpp @@ -63,16 +63,8 @@ BUILTIN_COMMON(TWO_ARGS, max, BUILTIN_COMMON(TWO_ARGS, min, [](auto x, auto y) -> decltype(x) { return (y < x ? y : x); }) -// clamp is implemented for INTEGER_TYPES as well, so expand/inline -// BUILTIN_COMMON manually. -HOST_IMPL(clamp, [](auto x, auto y, auto z) -> decltype(x) { - using ElemTy = detail::get_elem_type_t; - if constexpr (std::is_integral_v) { - return std::min(std::max(x, y), z); - } else { - return std::fmin(std::fmax(x, y), z); - } +BUILTIN_COMMON(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) { + return std::fmin(std::fmax(x, y), z); }) -EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, clamp, INTEGER_TYPES, FP_TYPES) } // namespace _V1 } // namespace sycl diff --git a/sycl/source/builtins/host_helper_macros.hpp b/sycl/source/builtins/host_helper_macros.hpp index 484b0bc95fb8b..41aac2148db71 100644 --- a/sycl/source/builtins/host_helper_macros.hpp +++ b/sycl/source/builtins/host_helper_macros.hpp @@ -56,6 +56,9 @@ #define EXPORT_VEC(NUM_ARGS, NAME, TYPE, VL) \ EXPORT_VEC_NS(NUM_ARGS, NAME, sycl, TYPE, VL) +#define EXPORT_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE) \ + FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE) + #define EXPORT_SCALAR_AND_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE) \ EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE) \ FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE) @@ -69,8 +72,12 @@ #define EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...) \ FOR_EACH3(EXPORT_SCALAR_AND_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__) +#define EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...) \ + FOR_EACH3(EXPORT_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__) #define EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, ...) \ EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__) +#define EXPORT_VEC_1_16(NUM_ARGS, NAME, ...) \ + EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__) #define EXPORT_SCALAR_AND_VEC_2_4(NUM_ARGS, NAME, ...) \ FOR_EACH2(EXPORT_SCALAR_AND_VEC_2_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__) diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp index 26c4dd9a5788f..cd92b2180df73 100644 --- a/sycl/source/builtins/integer_functions.cpp +++ b/sycl/source/builtins/integer_functions.cpp @@ -76,7 +76,8 @@ namespace sycl { inline namespace _V1 { #define BUILTIN_GENINT(NUM_ARGS, NAME, IMPL) \ HOST_IMPL(NAME, IMPL) \ - EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, INTEGER_TYPES) + FOR_EACH2(EXPORT_SCALAR, NUM_ARGS, NAME, INTEGER_TYPES) \ + EXPORT_VEC_1_16(NUM_ARGS, NAME, FIXED_WIDTH_INTEGER_TYPES) #define BUILTIN_GENINT_SU(NUM_ARGS, NAME, IMPL) \ BUILTIN_GENINT(NUM_ARGS, NAME, IMPL) @@ -214,6 +215,10 @@ BUILTIN_GENINT_SU(TWO_ARGS, max, BUILTIN_GENINT_SU(TWO_ARGS, min, [](auto x, auto y) -> decltype(x) { return y < x ? y : x; }) +BUILTIN_GENINT_SU(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) { + return std::min(std::max(x, y), z); +}) + template static inline constexpr T __clz_impl(T x, T m, T n = 0) { return (x & m) ? n : __clz_impl(x, T(m >> 1), ++n); } diff --git a/sycl/source/builtins/relational_functions.cpp b/sycl/source/builtins/relational_functions.cpp index b54c55e283e5e..b8b7795f6fb79 100644 --- a/sycl/source/builtins/relational_functions.cpp +++ b/sycl/source/builtins/relational_functions.cpp @@ -103,6 +103,7 @@ HOST_IMPL(bitselect, [](auto x, auto y, auto z) { assert((ures & std::numeric_limits::max()) == ures); return bit_cast(static_cast(ures)); }) -EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES) +FOR_EACH2(EXPORT_SCALAR, THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES) +EXPORT_VEC_1_16(THREE_ARGS, bitselect, FIXED_WIDTH_INTEGER_TYPES, FP_TYPES) } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp index 881a014c4831f..83309ec9f2d92 100644 --- a/sycl/source/detail/allowlist.cpp +++ b/sycl/source/detail/allowlist.cpp @@ -166,7 +166,8 @@ AllowListParsedT parseAllowList(const std::string &AllowListRaw) { // valid. E.g., for BackendName key, the allowed values are only ones // described in SyclBeMap ValidateEnumValues(BackendNameKeyName, getSyclBeMap()); - ValidateEnumValues(DeviceTypeKeyName, getSyclDeviceTypeMap()); + ValidateEnumValues(DeviceTypeKeyName, + getSyclDeviceTypeMap(true /*Enable 'acc'*/)); if (Key == DeviceVendorIdKeyName) { // DeviceVendorId should have hex format @@ -380,7 +381,8 @@ void applyAllowList(std::vector &PiDevices, Device, PI_DEVICE_INFO_TYPE, sizeof(sycl::detail::pi::PiDeviceType), &PiDevType, nullptr); sycl::info::device_type DeviceType = pi::cast(PiDevType); - for (const auto &SyclDeviceType : getSyclDeviceTypeMap()) { + for (const auto &SyclDeviceType : + getSyclDeviceTypeMap(true /*Enable 'acc'*/)) { if (SyclDeviceType.second == DeviceType) { const auto &DeviceTypeValue = SyclDeviceType.first; DeviceDesc[DeviceTypeKeyName] = DeviceTypeValue; diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp index f7760aa227168..7ae96d42e220d 100644 --- a/sycl/source/detail/config.cpp +++ b/sycl/source/detail/config.cpp @@ -165,14 +165,16 @@ void dumpConfig() { // TODO: host device type will be removed once sycl_ext_oneapi_filter_selector // is removed. const std::array, 6> & -getSyclDeviceTypeMap() { +getSyclDeviceTypeMap(bool supportAcc) { static const std::array, 6> - SyclDeviceTypeMap = {{{"host", info::device_type::host}, - {"cpu", info::device_type::cpu}, - {"gpu", info::device_type::gpu}, - {"acc", info::device_type::accelerator}, - {"fpga", info::device_type::accelerator}, - {"*", info::device_type::all}}}; + SyclDeviceTypeMap = { + {{"host", info::device_type::host}, + {"cpu", info::device_type::cpu}, + {"gpu", info::device_type::gpu}, + /* Duplicate entries are fine as this map is one-directional.*/ + {supportAcc ? "acc" : "fpga", info::device_type::accelerator}, + {"fpga", info::device_type::accelerator}, + {"*", info::device_type::all}}}; return SyclDeviceTypeMap; } diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 8f048e0f95f60..1079f32caa388 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -232,8 +232,10 @@ template <> class SYCLConfig { }; // Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR. +// The 'supportAcc' parameter is used by SYCL_DEVICE_ALLOWLIST which, +// unlike ONEAPI_DEVICE_SELECTOR, also accepts 'acc' as a valid device type. const std::array, 6> & -getSyclDeviceTypeMap(); +getSyclDeviceTypeMap(bool supportAcc = false); // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and // ONEAPI_DEVICE_SELECTOR @@ -514,7 +516,7 @@ template <> class SYCLConfig { return Result; std::string ValueStr{ValueRaw}; - auto DeviceTypeMap = getSyclDeviceTypeMap(); + auto DeviceTypeMap = getSyclDeviceTypeMap(true /*Enable 'acc'*/); // Iterate over all configurations. size_t Start = 0, End = 0; diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp index 311ebeaa174b8..eb3d0f83ed26e 100644 --- a/sycl/source/detail/device_filter.cpp +++ b/sycl/source/detail/device_filter.cpp @@ -93,9 +93,13 @@ static void Parse_ODS_Device(ods_target &Target, std::string_view TopDeviceStr = DeviceSubTuple[0]; // Handle explicit device type (e.g. 'gpu'). - auto DeviceTypeMap = - getSyclDeviceTypeMap(); // <-- std::array> + auto DeviceTypeMap = getSyclDeviceTypeMap( +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + true /*Enable 'acc'*/ +#endif + ); // <-- std::array> + auto It = std::find_if(std::begin(DeviceTypeMap), std::end(DeviceTypeMap), [&](auto DtPair) { return TopDeviceStr == DtPair.first; }); @@ -262,7 +266,11 @@ Parse_ONEAPI_DEVICE_SELECTOR(const std::string &envString) { std::ostream &operator<<(std::ostream &Out, const ods_target &Target) { Out << Target.Backend; if (Target.DeviceType) { - auto DeviceTypeMap = getSyclDeviceTypeMap(); + auto DeviceTypeMap = getSyclDeviceTypeMap( +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + true /*Enable 'acc'*/ +#endif + ); auto Match = std::find_if( DeviceTypeMap.begin(), DeviceTypeMap.end(), [&](auto Pair) { return (Pair.second == Target.DeviceType); }); @@ -335,11 +343,12 @@ device_filter::device_filter(const std::string &FilterString) { if (TripleValueID >= Tokens.size()) { DeviceType = info::device_type::all; } else { - auto Iter = std::find_if(std::begin(getSyclDeviceTypeMap()), - std::end(getSyclDeviceTypeMap()), FindElement); + auto Iter = std::find_if( + std::begin(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), + std::end(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), FindElement); // If no match is found, set device_type 'all', // which actually means 'any device_type' will be a match. - if (Iter == getSyclDeviceTypeMap().end()) + if (Iter == getSyclDeviceTypeMap(true /*Enable 'acc'*/).end()) DeviceType = info::device_type::all; else { DeviceType = Iter->second; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index ba50f0562ff56..a24905d4214da 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -342,7 +342,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, // handler rather than by-passing the scheduler. if (!MGraph.lock() && areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { + if (MSupportsDiscardingPiEvents) { MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); return createDiscardedEvent(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index ddd6a71d7db80..0fe4242cc9472 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -115,8 +115,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -292,8 +292,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -317,8 +317,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)) { queue_impl_interop(PiQueue); } @@ -374,7 +374,9 @@ class queue_impl { bool is_host() const { return MHostQueue; } /// \return true if this queue has discard_events support. - bool has_discard_events_support() const { return MHasDiscardEventsSupport; } + bool supportsDiscardingPiEvents() const { + return MSupportsDiscardingPiEvents; + } bool isInOrder() const { return MIsInorder; } @@ -970,12 +972,11 @@ class queue_impl { const bool MIsProfilingEnabled; protected: - // This flag says if we can discard events based on a queue "setup" which will - // be common for all operations submitted to the queue. This is a must - // condition for discarding, but even if it's true, in some cases, we won't be - // able to discard events, because the final decision is made right before the - // operation itself. - const bool MHasDiscardEventsSupport; + // Indicates whether the queue supports discarding PI events for tasks + // submitted to it. This condition is necessary but not sufficient, PI events + // should be discarded only if they also don't represent potential implicit + // dependencies for future tasks in other queues. + const bool MSupportsDiscardingPiEvents; // Command graph which is associated with this queue for the purposes of // recording commands to it. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 23b6eec33886e..955adae8423dc 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2704,7 +2704,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { } sycl::detail::pi::PiEvent *Event = - (MQueue->has_discard_events_support() && + (MQueue->supportsDiscardingPiEvents() && MCommandGroup->getRequirements().size() == 0) ? nullptr : &MEvent->getHandleRef(); @@ -2851,11 +2851,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); - bool DiscardEvent = (MQueue->has_discard_events_support() && - MCommandGroup->getRequirements().size() == 0); + bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() && + MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = - DiscardEvent ? nullptr : &MEvent->getHandleRef(); - detail::EventImplPtr EventImpl = DiscardEvent ? nullptr : MEvent; + DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); + detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent; switch (MCommandGroup->getType()) { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 02ffef951d1b5..70681a7504358 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -292,7 +292,7 @@ event handler::finalize() { }; bool DiscardEvent = false; - if (MQueue->has_discard_events_support()) { + if (MQueue->supportsDiscardingPiEvents()) { // Kernel only uses assert if it's non interop one bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp index 8fad623752873..ea572bfc4c5d7 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// +// REQUIRES: aspect-usm_shared_allocations #include "common.hpp" diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp index 7e4a35d09b994..c478fd96afb34 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// +// REQUIRES: aspect-usm_shared_allocations #include "common.hpp" diff --git a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp index 812a0bb115737..8be1db398a1a1 100644 --- a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp +++ b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp @@ -124,26 +124,27 @@ template void testUsmKind(sycl::queue &q) { [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_host}); }, [&]() { return ATHost(1, q); }, [&]() { return ATHost(1, Ctx); }, [&]() { return ATAnnotated(1, dev, Ctx, alloc::host); }}); - - CheckUsmKindAll( - alloc::shared, - std::tuple{ - [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); }, - [&]() { return MAnnotated(dev, Ctx, alloc::shared); }, - [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); }, - [&]() { return AShared(1, q); }, - [&]() { return AShared(1, dev, Ctx); }, - [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); }, - [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); }, - [&]() { return TAnnotated(dev, Ctx, alloc::shared); }, - [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); }, - [&]() { return ATShared(1, q); }, - [&]() { return ATShared(1, dev, Ctx); }, - [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }}); + if (dev.has(sycl::aspect::usm_shared_allocations)) { + CheckUsmKindAll( + alloc::shared, + std::tuple{ + [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); }, + [&]() { return MAnnotated(dev, Ctx, alloc::shared); }, + [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); }, + [&]() { return AShared(1, q); }, + [&]() { return AShared(1, dev, Ctx); }, + [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); }, + [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); }, + [&]() { return TAnnotated(dev, Ctx, alloc::shared); }, + [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); }, + [&]() { return ATShared(1, q); }, + [&]() { return ATShared(1, dev, Ctx); }, + [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }}); + } } int main() { sycl::queue q; testUsmKind(q); return 0; -} \ No newline at end of file +} diff --git a/sycl/test-e2e/Basic/group_local_memory.cpp b/sycl/test-e2e/Basic/group_local_memory.cpp index 8185378071c75..b5bb26917ec0d 100644 --- a/sycl/test-e2e/Basic/group_local_memory.cpp +++ b/sycl/test-e2e/Basic/group_local_memory.cpp @@ -7,17 +7,21 @@ constexpr int N = 5; int main() { sycl::queue q; - int *ptr = sycl::malloc_shared(N, q); - q.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) { - auto g = it.get_group(); - auto mem = sycl::ext::oneapi::group_local_memory(g, 1, 2, 3, 4, 5); - auto ref = *mem; - for (int i = 0; i < N; ++i) { - ptr[i] = ref[i]; - } - }).wait(); + sycl::buffer buf{sycl::range{N}}; + q.submit([&](sycl::handler &h) { + sycl::accessor acc{buf, h}; + h.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) { + auto g = it.get_group(); + auto mem = + sycl::ext::oneapi::group_local_memory(g, 1, 2, 3, 4, 5); + auto ref = *mem; + for (int i = 0; i < N; ++i) { + acc[i] = ref[i]; + } + }); + }); + sycl::host_accessor result{buf}; for (int i = 0; i < N; ++i) { - assert(ptr[i] == (i + 1)); + assert(result[i] == (i + 1)); } - sycl::free(ptr, q); } diff --git a/sycl/test-e2e/Basic/large-range.cpp b/sycl/test-e2e/Basic/large-range.cpp index 354276065a739..f2c38cbb5dd8d 100644 --- a/sycl/test-e2e/Basic/large-range.cpp +++ b/sycl/test-e2e/Basic/large-range.cpp @@ -26,15 +26,18 @@ void check_sum(std::string_view desc, const ContainerT &data, size_t N) { template void test_regular(std::string_view desc, queue &q, size_t B, RangeT range) { auto N = range.size(); - std::vector accumulators_v(B, 0, usm_allocator(q)); - auto *accumulators = accumulators_v.data(); - - q.parallel_for(range, [=](auto it) { - atomic_ref ref( - accumulators[it.get_linear_id() % B]); - ++ref; - }).wait(); - + std::vector accumulators_v(B, 0); + { + sycl::buffer accumulator_buf{accumulators_v}; + q.submit([&](sycl::handler &h) { + sycl::accessor accumulators{accumulator_buf, h}; + h.parallel_for(range, [=](auto it) { + atomic_ref ref( + accumulators[it.get_linear_id() % B]); + ++ref; + }); + }); + } // destruction of accumulator_buf here writes back data to accumulators_v check_sum(desc, accumulators_v, N); } @@ -42,18 +45,19 @@ template void test_spec_constant(std::string_view desc, queue &q, size_t B, RangeT range) { auto N = range.size(); - std::vector accumulators_v(B, 0, usm_allocator(q)); - auto *accumulators = accumulators_v.data(); - - q.submit([&](handler &cgh) { - cgh.set_specialization_constant(2); - cgh.parallel_for(range, [=](auto it, kernel_handler h) { - atomic_ref ref( - accumulators[it.get_linear_id() % B]); - ref += h.get_specialization_constant(); - }); - }).wait(); - + std::vector accumulators_v(B, 0); + { + sycl::buffer accumulators_buf{accumulators_v}; + q.submit([&](handler &cgh) { + sycl::accessor accumulators{accumulators_buf, cgh}; + cgh.set_specialization_constant(2); + cgh.parallel_for(range, [=](auto it, kernel_handler h) { + atomic_ref ref( + accumulators[it.get_linear_id() % B]); + ref += h.get_specialization_constant(); + }); + }); + } // destruction of accumulators_buf here writes data back to accumulators_v check_sum(desc, accumulators_v, N * 2); } diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp index 43d16ff4d16d6..ec341d39d0eac 100644 --- a/sycl/test-e2e/Basic/span.cpp +++ b/sycl/test-e2e/Basic/span.cpp @@ -3,7 +3,7 @@ // // Fails to release USM pointer on HIP for NVIDIA // XFAIL: hip_nvidia - +// REQUIRES: aspect-usm_shared_allocations #include #include diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp new file mode 100644 index 0000000000000..158b52ab5f27e --- /dev/null +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp @@ -0,0 +1,150 @@ +// REQUIRES: preview-breaking-changes-supported +// RUN: %{build} -fpreview-breaking-changes -o %t.out +// RUN: %{run} %t.out + +// This test currently fails on AMD HIP due to an unresolved memcmp function. +// XFAIL: hip_amd + +// Checks scalar/vec operator ordering. + +#include + +template +using rel_t = std::conditional_t< + sizeof(T) == 1, int8_t, + std::conditional_t< + sizeof(T) == 2, int16_t, + std::conditional_t>>>; + +template +bool CheckResult(sycl::vec V, T2 Ref) { + if constexpr (IsRelOp) { + // Check that all elements have the same boolean representation as the + // scalar. + for (size_t I = 0; I < N; ++I) + if (static_cast(V[I]) != static_cast(Ref)) + return false; + return true; + } else { + // Check that all elements are equal to the scalar. + for (size_t I = 0; I < N; ++I) + if (V[I] != Ref) + return false; + return true; + } +} + +#define CHECK(Q, C, T, N, IS_RELOP, OP) \ + { \ + using VecT = sycl::vec; \ + using ResT = sycl::vec, T>, N>; \ + constexpr T RefVal = 2; \ + VecT InVec{static_cast(RefVal)}; \ + { \ + VecT OutVecsDevice[2]; \ + T OutRefsDevice[2]; \ + { \ + sycl::buffer OutVecsBuff{OutVecsDevice, 2}; \ + sycl::buffer OutRefsBuff{OutRefsDevice, 2}; \ + Q.submit([&](sycl::handler &CGH) { \ + sycl::accessor OutVecsAcc{OutVecsBuff, CGH, sycl::read_write}; \ + sycl::accessor OutRefsAcc{OutRefsBuff, CGH, sycl::read_write}; \ + CGH.single_task([=]() { \ + auto OutVec1 = InVec OP RefVal; \ + auto OutVec2 = RefVal OP InVec; \ + static_assert(std::is_same_v); \ + static_assert(std::is_same_v); \ + OutVecsAcc[0] = OutVec1; \ + OutVecsAcc[1] = OutVec2; \ + OutRefsAcc[0] = RefVal OP RefVal; \ + OutRefsAcc[1] = RefVal OP RefVal; \ + }); \ + }); \ + } \ + if (!CheckResult(OutVecsDevice[0], OutRefsDevice[0])) { \ + std::cout << ("Check of vector " #OP \ + " scalar from device failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + if (!CheckResult(OutVecsDevice[1], OutRefsDevice[1])) { \ + std::cout << ("Check of scalar " #OP \ + " vector from device failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + } \ + { \ + auto OutVec1 = InVec OP RefVal; \ + auto OutVec2 = RefVal OP InVec; \ + static_assert(std::is_same_v); \ + static_assert(std::is_same_v); \ + if (!CheckResult(OutVec1, RefVal OP RefVal)) { \ + std::cout << ("Check of vector " #OP \ + " scalar from host failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + if (!CheckResult(OutVec2, RefVal OP RefVal)) { \ + std::cout << ("Check of scalar " #OP \ + " vector from host failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + } \ + } + +#define CHECK_SIZES(Q, C, T, IS_RELOP, OP) \ + CHECK(Q, C, T, 1, IS_RELOP, OP) \ + CHECK(Q, C, T, 2, IS_RELOP, OP) \ + CHECK(Q, C, T, 4, IS_RELOP, OP) \ + CHECK(Q, C, T, 8, IS_RELOP, OP) \ + CHECK(Q, C, T, 16, IS_RELOP, OP) + +// NOTE: For the sake of compile-time we pick only a few operators per category. +#define CHECK_SIZES_AND_COMMON_OPS(Q, C, T) \ + CHECK_SIZES(Q, Failures, T, false, *) \ + CHECK_SIZES(Q, Failures, T, true, &&) \ + CHECK_SIZES(Q, Failures, T, true, ==) \ + CHECK_SIZES(Q, Failures, T, true, <) \ + CHECK_SIZES(Q, Failures, T, true, >=) +#define CHECK_SIZES_AND_INT_ONLY_OPS(Q, C, T) \ + CHECK_SIZES(Q, Failures, T, false, %) \ + CHECK_SIZES(Q, Failures, T, false, >>) \ + CHECK_SIZES(Q, Failures, T, false, ^) + +int main() { + sycl::queue Q; + int Failures = 0; + + // Check operators on types with requirements if they are supported. + if (Q.get_device().has(sycl::aspect::fp16)) { + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, sycl::half); + } + if (Q.get_device().has(sycl::aspect::fp64)) { + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, double); + } + + // Check all operators without requirements. + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, float); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int8_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int16_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int32_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int64_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint8_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint16_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint32_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint64_t); + + // Check integer only operators. + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int8_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int16_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int32_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int64_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint8_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint16_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint32_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint64_t); + return Failures; +} diff --git a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp index 7b81493db6dce..d2fabb6f7e967 100644 --- a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp +++ b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out - +// REQUIRES: aspect-usm_shared_allocations //==---------- wrapped_usm_pointer.cpp - test pointers in struct ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp index 9bb5122ffc7f7..1178bdf0b5ffe 100644 --- a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp @@ -13,31 +13,26 @@ bool operator()(sycl::queue &Q, cmplx init, \ cmplx ref = cmplx(0, 0), bool use_ref = false) { \ bool pass = true; \ - \ auto std_in = init_std_complex(init.re, init.im); \ experimental::complex cplx_input{init.re, init.im}; \ - \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ /*Get std::complex output*/ \ std::complex std_out{ref.re, ref.im}; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(cplx_input); \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(cplx_input); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(cplx_input); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(cplx_input); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -73,8 +68,7 @@ TEST_MATH_OP_TYPE(tanh) \ auto std_in = init_std_complex(init.re, init.im); \ experimental::complex cplx_input{init.re, init.im}; \ - \ - auto *cplx_out = sycl::malloc_shared(1, Q); \ + sycl::buffer cplx_out_buf{sycl::range{1}}; \ \ /*Get std::complex output*/ \ T std_out = ref.re; \ @@ -82,19 +76,18 @@ TEST_MATH_OP_TYPE(tanh) std_out = std::math_func(std_in); \ \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(cplx_input); \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(cplx_input); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(cplx_input); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(cplx_input); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -121,23 +114,21 @@ TEST_MATH_OP_TYPE(imag) std::complex std_out = ref; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(std_in); \ - }).wait(); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(std_in); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(std_in); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(std_in); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -161,23 +152,21 @@ TEST_MATH_OP_TYPE(proj) T std_out = ref; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ - auto *cplx_out = sycl::malloc_shared(1, Q); \ - \ + sycl::buffer cplx_out_buf{sycl::range{1}}; \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(init); \ - }).wait(); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(std_in); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(init); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(init); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -197,26 +186,25 @@ template struct test_polar { bool use_ref = false) { bool pass = true; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range(1)}; /*Get std::complex output*/ std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::polar(init.re, init.im); /*Check cplx::complex output from device*/ - Q.single_task([=]() { - cplx_out[0] = experimental::polar(init.re, init.im); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::polar(init.re, init.im); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); /*Check cplx::complex output from host*/ - cplx_out[0] = experimental::polar(init.re, init.im); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::polar(init.re, init.im); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } diff --git a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp index ed68332f60bdd..8894f5d965462 100644 --- a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp @@ -15,23 +15,19 @@ experimental::complex cplx_input1{init_re1, init_im1}; \ experimental::complex cplx_input2{init_re2, init_im2}; \ \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ std::complex std_out; \ std_out = std_in1 op std_in2; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] = cplx_input1 op cplx_input2; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ - Q.single_task([=]() { \ - cplx_out[0] = cplx_input1 op cplx_input2; \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ - \ - cplx_out[0] = cplx_input1 op cplx_input2; \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = cplx_input1 op cplx_input2; \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -53,28 +49,32 @@ test_op(test_div, /); experimental::complex cplx_input{init_re1, init_im1}; \ \ auto std_inout = init_std_complex(init_re2, init_im2); \ - auto *cplx_inout = sycl::malloc_shared>(1, Q); \ - cplx_inout[0].real(init_re2); \ - cplx_inout[0].imag(init_im2); \ - \ + experimental::complex cplx; \ + cplx.real(init_re2); \ + cplx.imag(init_im2); \ + sycl::buffer> cplx_out_buf{&cplx, \ + sycl::range{1}}; \ std_inout op_assign std_in; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] op_assign cplx_input; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - Q.single_task([=]() { cplx_inout[0] op_assign cplx_input; }).wait(); \ + pass &= \ + check_results(cplx_out_acc[0], \ + std::complex(std_inout.real(), std_inout.imag()), \ + /*is_device*/ true); \ \ - pass &= check_results( \ - cplx_inout[0], std::complex(std_inout.real(), std_inout.imag()), \ - /*is_device*/ true); \ + cplx_out_acc[0].real(init_re2); \ + cplx_out_acc[0].imag(init_im2); \ \ - cplx_inout[0].real(init_re2); \ - cplx_inout[0].imag(init_im2); \ + cplx_out_acc[0] op_assign cplx_input; \ \ - cplx_inout[0] op_assign cplx_input; \ - \ - pass &= check_results( \ - cplx_inout[0], std::complex(std_inout.real(), std_inout.imag()), \ - /*is_device*/ false); \ - \ - sycl::free(cplx_inout, Q); \ + pass &= \ + check_results(cplx_out_acc[0], \ + std::complex(std_inout.real(), std_inout.imag()), \ + /*is_device*/ false); \ \ return pass; \ } \ @@ -99,19 +99,18 @@ test_op_assign(test_div_assign, /=); experimental::complex cplx_input{init_re1, init_im1}; \ \ std::complex std_out{}; \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ std_out = op std_in; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] = op cplx_input; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ - Q.single_task([=]() { cplx_out[0] = op cplx_input; }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ - \ - cplx_out[0] = op cplx_input; \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ + cplx_out_acc[0] = op cplx_input; \ \ - sycl::free(cplx_out, Q); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ \ return pass; \ } \ diff --git a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp index c8555687eb49b..91d58a4cdeebb 100644 --- a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp @@ -51,26 +51,26 @@ template struct test_pow_cplx_cplx { experimental::complex cplx_input1{init1.re, init1.im}; experimental::complex cplx_input2{init2.re, init2.im}; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_in1, std_in2); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task([=]() { + cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); + }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::pow(cplx_input1, cplx_input2); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } @@ -87,26 +87,25 @@ template struct test_pow_cplx_deci { experimental::complex cplx_input{init1.re, init1.im}; T deci_input = init2.re; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_in, std_deci_in); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(cplx_input, deci_input); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::pow(cplx_input, deci_input); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(cplx_input, deci_input); + cplx_out_acc[0] = experimental::pow(cplx_input, deci_input); - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); - - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } @@ -123,26 +122,24 @@ template struct test_pow_deci_cplx { experimental::complex cplx_input{init2.re, init2.im}; T deci_input = init1.re; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_deci_in, std_in); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(deci_input, cplx_input); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); - + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::pow(deci_input, cplx_input); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(deci_input, cplx_input); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::pow(deci_input, cplx_input); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } diff --git a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp index b9cb8c519366d..d8d645f6ac7dc 100644 --- a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp @@ -7,17 +7,15 @@ template struct test_sycl_stream_operator { bool operator()(sycl::queue &Q, cmplx init) { - auto *cplx_out = sycl::malloc_shared>(1, Q); - cplx_out[0] = experimental::complex(init.re, init.im); - + experimental::complex cplx(init.re, init.im); + sycl::buffer> cplx_out_buf{&cplx, sycl::range{1}}; Q.submit([&](sycl::handler &CGH) { + sycl::accessor cplx_out{cplx_out_buf, CGH}; sycl::stream Out(512, 20, CGH); CGH.parallel_for<>(sycl::range<1>(1), [=](sycl::id<1> idx) { Out << cplx_out[idx] << sycl::endl; }); }).wait(); - - sycl::free(cplx_out, Q); return true; } }; diff --git a/sycl/test-e2e/ESIMD/aot_mixed.cpp b/sycl/test-e2e/ESIMD/aot_mixed.cpp index 32c10969ee51b..5204d413e69ad 100644 --- a/sycl/test-e2e/ESIMD/aot_mixed.cpp +++ b/sycl/test-e2e/ESIMD/aot_mixed.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// // TODO: Enable on other GPUs once internal ticket is fixed // REQUIRES: ocloc && gpu-intel-gen12 -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s // RUN: %{run} %t.sycl.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.out %s // RUN: %{run} %t.out // This test checks the following ESIMD ahead-of-time compilation scenarios: diff --git a/sycl/test-e2e/ESIMD/api/ballot.cpp b/sycl/test-e2e/ESIMD/api/ballot.cpp index a636bcee147e0..4f87e562972db 100644 --- a/sycl/test-e2e/ESIMD/api/ballot.cpp +++ b/sycl/test-e2e/ESIMD/api/ballot.cpp @@ -71,15 +71,13 @@ template bool test(queue &Q) { template bool test(queue &Q) { bool Pass = true; - // TODO: uncomment calls below once simd<...>.copy_from() starts supporting - // sizes other than 8, 16 and 32. - // Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); - // Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); - // Pass &= test(Q); - // Pass &= test(Q); - // Pass &= test(Q); + Pass &= test(Q); + Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); return Pass; diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp index 3d50a0915afe1..4c9e528646545 100644 --- a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp +++ b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp @@ -94,7 +94,6 @@ template struct DpasNaturalOperandType { static constexpr bool is_bf16 = T == dpas_argument_type::bf16; static constexpr bool is_tf32 = T == dpas_argument_type::tf32; - // TODO: support tf32 here. using type = std::conditional_t< is_sint, signed char, std::conditional_t< @@ -149,7 +148,7 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col, ElemT *Vec = reinterpret_cast(VVec); // 1. Find and read the target 'unsigned int' element. - // THe unpacked matrix has dimensions: NumRows*NumCols + // The unpacked matrix dimensions are NumRows*NumCols. constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize; int UnpackedLinearIndex = Row * NumCols + Col; int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT; @@ -160,7 +159,6 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col, } else { ElemT TargetElem = Vec[PackedLinearIndex]; // TargetElem has 2 or more elements in it. Need to extract one. - // TODO: for now assume that is the case only for 2 or 4-bit integers. assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; @@ -196,7 +194,6 @@ ReadT readFromHorizontallyPackedMatrix(void *VVec, int Row, int Col) { return static_cast(TargetElem); } else { // TargetElem has 2 or more elements in it. Need to extract one. - // TODO: for now assume that is the case only for 2 or 4-bit integers. assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; unsigned int Mask = (static_cast(1) << ElemBitSize) - 1; diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 755c436f0b329..8ea1fcf4a08ad 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -230,3 +230,222 @@ template bool testUSM(queue Q) { return Passed; } + +template +bool testSLM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 1; + constexpr size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "SLM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + uint16_t LocalID = ndi.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + slm_init(); + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(Out + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + slm_scatter(Offsets, InVec); + } + } + barrier(); + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + + simd Vals = slm_gather(ByteOffsets, Props); + Vals *= 2; + + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } + barrier(); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I++) { + simd Offsets(I * sizeof(T), sizeof(T)); + simd OutVec = slm_gather(Offsets); + OutVec.copy_to(Out + GlobalElemOffset + I); + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testSLM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // Test scatter() that is available on Gen12 and PVC. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + + Passed &= testSLM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testSLM(Q, 2, AlignElemProps) + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp index 929d3c6fc04f7..ff331a421ccef 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out // RUN: %{run} %t.out // The test verifies esimd::scatter() functions accepting USM pointer // and optional compile-time esimd::properties. -// The scatter() calls in this test do not use cache-hint -// properties to not impose using DG2/PVC features. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. #include "Inputs/scatter.hpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp new file mode 100644 index 0000000000000..08ac29ba6b605 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp @@ -0,0 +1,21 @@ +//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "scatter_usm.cpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp new file mode 100644 index 0000000000000..ffa0a718e7689 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp @@ -0,0 +1,33 @@ +//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp new file mode 100644 index 0000000000000..4ebe20e376cf1 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp @@ -0,0 +1,37 @@ +//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting +// optional compile-time esimd::properties. +// The slm_scatter() calls in this test use DG2 or PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp new file mode 100644 index 0000000000000..0aca53311ef6b --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp @@ -0,0 +1,20 @@ +//==------- slm_scatter_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "slm_scatter.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp index cf17e3b6c3dba..b39f3b4fc889b 100644 --- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp index 6e76ec42c079e..00940dcd6d68c 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp index 040ce8c30c38b..fd8fcec4b89d2 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp index bb12ff6006004..5855fc25ed4cc 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp index f989a27a1ca39..3be8febbe008e 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp index 7f7501175899c..301c53aa69795 100644 --- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp +++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp index 167219bf4b8ef..62bcd167e484f 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task2.cpp b/sycl/test-e2e/Graph/Explicit/host_task2.cpp index f7d59f48aac09..78be02a7bc991 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task2.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task2.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp index aed2916284cc1..26dcf148d6f7d 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp index 3d03b7f6013ad..32c7157d0958c 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp index 6fc8744e04c18..b2236b9a0d91b 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp index da26b8b182296..2147d075715af 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/memadvise.cpp b/sycl/test-e2e/Graph/Explicit/memadvise.cpp index 11e0d6296290f..1a8313478ddb5 100644 --- a/sycl/test-e2e/Graph/Explicit/memadvise.cpp +++ b/sycl/test-e2e/Graph/Explicit/memadvise.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // Mem advise command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Mem advise is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp index a83775b8ecb8a..e9c76ad01113b 100644 --- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/prefetch.cpp b/sycl/test-e2e/Graph/Explicit/prefetch.cpp index 8e0e6e15c292d..4b670ebc89357 100644 --- a/sycl/test-e2e/Graph/Explicit/prefetch.cpp +++ b/sycl/test-e2e/Graph/Explicit/prefetch.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // prefetch command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Prefetch is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp index a4f3710eea9f0..0760b5805e6d9 100644 --- a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp @@ -4,6 +4,11 @@ // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %} // // CHECK-NOT: LEAK +// +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp index a2589cda54320..85d3b106bc3cd 100644 --- a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp index 8b1f6ba3d4455..935f431b09332 100644 --- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp index 830cc5a42a668..c60aa2f63e34f 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp index bc3cb1c8f352d..772c07599300f 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp index 92add086ee20d..17cdde9b87956 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp index 05ecfbd982a01..7fd72c79722d4 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp index 8fa8e0c334de2..d43e7e221e68c 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp index 323d3e35935ac..6b60911d4627e 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp @@ -7,7 +7,10 @@ // // // USM copy command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp index 86076768f5d32..966809cf5a8d3 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp index e0100d42d7267..53feb52cad798 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp index e9bb9ebab0fa9..50821fd8d008b 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp @@ -6,7 +6,10 @@ // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // // USM memset command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp index ba81666277e44..c0729ebb3a015 100644 --- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp @@ -7,7 +7,10 @@ // // // Temporarily disabled until failure is addressed. -// UNSUPPORTED: windows +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: windows, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp index 767ca8b425ab7..4a87a09e0fbce 100644 --- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // This test attempts recording a set of kernels after they have already been // executed once before. diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp index c12038ba46185..951a016994261 100644 --- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp index ffddf70716250..bcdfb702b78c7 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp index a90931934f0ce..b71fd0a690220 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp index 410850f5ff7a8..74bc18a7b3496 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp index 2f19c2706380b..51e4597354473 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp index 70128db712f78..2155b226e2fc3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp index 582a3fb3f14ff..4f4d50fe3b3f3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp index f947a13ffd63d..cd867eb4caadd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp index e40fe3a0e963c..c3037c78d8eaf 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // This test uses a host_task when adding a command_graph node to an // in-order queue. diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp index c8dfa02c63ea7..64d625197f089 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp index 7e312df58d092..a8ff9b53637cd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp index 6d86cef96c862..4e2730592ef95 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp index 51da588ab9444..e5570cb61a2d3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp index a5b87f8143832..1f7989f9a0361 100644 --- a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // Mem advise command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Mem advise is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp index 67b5335de4383..0709d25e225bd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp index dca297f7772b3..a5a18c08c6b66 100644 --- a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // prefetch command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Prefetch is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp index aa81923251cb6..a88e462461a04 100644 --- a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp @@ -4,6 +4,11 @@ // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %} // // CHECK-NOT: LEAK +// +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp index 99a28bd745853..d5e9520417a71 100644 --- a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp index badf7293f49cb..9e2416541e091 100644 --- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp index 2096a7c7a21f7..37e29e16edf40 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp index f1aeef51f1c31..b7442d24c6f1d 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp index d26f99a34c290..57b4c1cd3abdd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp index f2b7ca35a77d7..e78794733dc42 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp index a1d0373290cc6..8c3c5f0980c55 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp index 09486c4ebc8da..1fdf104310623 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp @@ -5,9 +5,11 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// // USM copy command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp index 8e89ecc693ed7..66a9589d9e242 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp index 401cdc7aa430f..ed959ca5a400b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp index 137b3df19cf02..86c533110f26b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp @@ -6,7 +6,10 @@ // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // USM memset command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp index ff4984603d908..c574d65e9357f 100644 --- a/sycl/test-e2e/Graph/empty_graph.cpp +++ b/sycl/test-e2e/Graph/empty_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // Tests the ability to finalize and submit a command graph which doesn't // contain any nodes. diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp index 935df8cab7afa..a16b903c09a84 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp index cf21d2cd84714..ea7434dd31498 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp index 50f63b777d647..58978e203d8a8 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp index 053c48e53e353..98d9b1ff1520d 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp index 4754d32a57b77..47f0fd98311ae 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp index cf7ab9f6d8a6e..395eb4af68c03 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp index 4a24d6dbd441e..e36a15cf1cbf4 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp index ec6f1026d9bea..53b64cf2ba2fd 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp index b76357962f0fc..0ce42082fe2ef 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp index 3b2d8aa1353ee..86151cdc7cab7 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp index 71ca270f231cd..78321716777ac 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { C[wiID] = 43; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(""); @@ -30,7 +30,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 43)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp index c28616ee70750..00c3eb3830f38 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp index c28616ee70750..00c3eb3830f38 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp index 11d647cdfbe1d..812e30a7c9f56 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { for (int i = 0; i < 10; ++i) { #if defined(__SYCL_DEVICE_ONLY__) asm("fence_sw"); @@ -35,7 +35,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 45)) diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp index d43b9cd83ac81..26d0952225901 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] { C[wiID] = 43; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(""); @@ -30,7 +30,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f, true, false, {8})) + if (!launchInlineASMTest(f, {8})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 43)) diff --git a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp index ab15d58164e3f..e671acc825e7b 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f, true, false, {8})) + if (!launchInlineASMTest(f, {8})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp index 401dfbcacb63c..739feed41d6c9 100644 --- a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp +++ b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -33,7 +33,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mad (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0> %3(0, " "0)<1;1,0>" @@ -56,7 +56,7 @@ int main() { } KernelFunctor<> f(inputA, inputB, inputC); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &D = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp index dd7cbc6739816..60d200d1c99df 100644 --- a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp +++ b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { // declaration of temp within and outside the scope #if defined(__SYCL_DEVICE_ONLY__) asm("{\n" @@ -59,7 +59,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_add.cpp b/sycl/test-e2e/InlineAsm/asm_float_add.cpp index bd4298b0ddad3..b6374c96d2010 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_add.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_add.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -30,7 +30,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" : "=rw"(C[wiID]) @@ -51,7 +51,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp index 51cfa81149118..948d983554c4e 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -27,7 +27,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" : "=rw"(B[wiID]) @@ -45,7 +45,7 @@ int main() { input[i] = (float)1 / std::pow(2, i); KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp index 39176ab355c96..43ad56e41222d 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mov (M1, 16) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" : "=rw"(B[wiID]) @@ -45,7 +45,7 @@ int main() { input[i] = 1.0 / i; KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &R = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_if.cpp b/sycl/test-e2e/InlineAsm/asm_if.cpp index 9cee76efee2e0..679980a62aaf7 100644 --- a/sycl/test-e2e/InlineAsm/asm_if.cpp +++ b/sycl/test-e2e/InlineAsm/asm_if.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -18,7 +18,7 @@ template struct KernelFunctor : WithOutputBuffer { bool switchField = false; CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { int Output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" @@ -42,7 +42,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; if (verify_all_the_same(Functor.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp index 48415e1445af0..d2a53f14691da 100644 --- a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -26,7 +26,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" : "=rw"(B[wiID]) @@ -44,7 +44,7 @@ int main() { input[i] = i; KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_loop.cpp b/sycl/test-e2e/InlineAsm/asm_loop.cpp index 0c1cc06568d6a..56518dffbf4e3 100644 --- a/sycl/test-e2e/InlineAsm/asm_loop.cpp +++ b/sycl/test-e2e/InlineAsm/asm_loop.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { CGH); CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" ".decl P1 v_type=P num_elts=16\n" @@ -67,7 +67,7 @@ int main() { } KernelFunctor<> Functor(InputA, InputB); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; auto &C = Functor.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_mul.cpp b/sycl/test-e2e/InlineAsm/asm_mul.cpp index 3f2b91c53cf59..b038a79f8abb1 100644 --- a/sycl/test-e2e/InlineAsm/asm_mul.cpp +++ b/sycl/test-e2e/InlineAsm/asm_mul.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -28,7 +28,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" : "=rw"(C[wiID]) @@ -49,7 +49,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp index f51f5f58501ca..cdcf08b0f809d 100644 --- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp +++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -DTO_PASS -o %t.out.pass // RUN: %{run} %t.out.pass // RUN: %{build} -o %t.out @@ -36,7 +36,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(TO_PASS) // The code below passing verification volatile int output = -1; @@ -85,7 +85,7 @@ int main() { } KernelFunctor<> f(inputA, inputB, inputC); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), diff --git a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp index 5037fd68af41c..5b770bb5baa0d 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -12,10 +12,7 @@ int main() { sycl::queue Queue; sycl::device Device = Queue.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -25,12 +22,13 @@ int main() { // Submitting command group(work) to queue Queue.submit([&](sycl::handler &cgh) { // Executing kernel - cgh.parallel_for( - NumOfWorkItems, - [=](sycl::id<1> WIid) [[intel::reqd_sub_group_size(16)]] { + // clang-format off + cgh.parallel_for(NumOfWorkItems, + [=](sycl::id<1> WIid) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) - asm("barrier"); + asm("barrier"); #endif - }); + }); + // clang-format on }); } diff --git a/sycl/test-e2e/InlineAsm/asm_no_output.cpp b/sycl/test-e2e/InlineAsm/asm_no_output.cpp index a6d44b35a63dd..0622f4f8edb38 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_output.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_output.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int local_var = 47; local_var += C[0]; #if defined(__SYCL_DEVICE_ONLY__) @@ -37,7 +37,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 0)) diff --git a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp index 10157938517bf..db2659d8e7077 100644 --- a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp +++ b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" : "+rw"(B[wiID]) @@ -47,7 +47,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_switch.cpp b/sycl/test-e2e/InlineAsm/asm_switch.cpp index e77a7e6edc531..ccd81fd34af45 100644 --- a/sycl/test-e2e/InlineAsm/asm_switch.cpp +++ b/sycl/test-e2e/InlineAsm/asm_switch.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -18,7 +18,7 @@ template struct KernelFunctor : WithOutputBuffer { int switchField = 2; CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { int Output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" @@ -62,7 +62,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; if (verify_all_the_same(Functor.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/include/asmhelper.h b/sycl/test-e2e/InlineAsm/include/asmhelper.h index f96d056dcfec7..abc7d24668691 100644 --- a/sycl/test-e2e/InlineAsm/include/asmhelper.h +++ b/sycl/test-e2e/InlineAsm/include/asmhelper.h @@ -91,8 +91,8 @@ auto exception_handler = [](sycl::exception_list exceptions) { }; template -bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, - std::vector RequiredSGSizes = {}) { +bool launchInlineASMTestImpl(F &f, + const std::vector &RequiredSGSizes = {}) { sycl::queue deviceQueue(sycl::gpu_selector_v, exception_handler); sycl::device device = deviceQueue.get_device(); @@ -101,22 +101,19 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, return false; } - auto Vec = device.get_info(); - if (requires_particular_sg_size && - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { - std::cout << "Skipping test\n"; - return false; - } - - auto sg_sizes = device.get_info(); - if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(), - [&](size_t RequiredSGSize) { - return std::find(sg_sizes.begin(), sg_sizes.end(), - RequiredSGSize) == sg_sizes.end(); - })) { - std::cout << "Skipping test\n"; - return false; + if (!RequiredSGSizes.empty()) { + auto supported_sg_sizes = + device.get_info(); + if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(), + [&](size_t RequiredSGSize) { + return std::find(supported_sg_sizes.begin(), + supported_sg_sizes.end(), + RequiredSGSize) == + supported_sg_sizes.end(); + })) { + std::cout << "Skipping test\n"; + return false; + } } deviceQueue.submit(f).wait_and_throw(); @@ -128,13 +125,11 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, /// /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise template -bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true, - bool exception_expected = false, - std::vector RequiredSGSizes = {}) { +bool launchInlineASMTest(F &f, const std::vector &RequiredSGSizes = {}, + bool exception_expected = false) { bool result = false; try { - result = launchInlineASMTestImpl(f, requires_particular_sg_size, - RequiredSGSizes); + result = launchInlineASMTestImpl(f, RequiredSGSizes); } catch (sycl::exception &e) { std::string what = e.what(); if (exception_expected && diff --git a/sycl/test-e2e/InlineAsm/letter_example.cpp b/sycl/test-e2e/InlineAsm/letter_example.cpp index 393c362c33fe0..c09c6d9f75ae4 100644 --- a/sycl/test-e2e/InlineAsm/letter_example.cpp +++ b/sycl/test-e2e/InlineAsm/letter_example.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -15,10 +15,7 @@ int main() { sycl::queue q; sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -31,7 +28,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) int i = idx[0]; asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp index be2ee81836261..c541ac4d17838 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-32,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -16,10 +16,7 @@ int main() { sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -40,7 +37,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(32)]] { int i = idx[0]; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(R"a( diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp index 0470bb2431e9a..5ea597d5d1064 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -16,10 +16,7 @@ int main() { sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -37,7 +34,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { int i = idx[0]; volatile int tmp = a[i]; tmp += 1; diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp index 76ad3f6e95260..3da4e4c7e7e71 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -15,10 +15,7 @@ int main() { sycl::queue q; sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -31,7 +28,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { int i = idx[0]; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" : "=rw"(a[i])); diff --git a/sycl/test-e2e/USM/align.cpp b/sycl/test-e2e/USM/align.cpp new file mode 100755 index 0000000000000..9659ec78b9a88 --- /dev/null +++ b/sycl/test-e2e/USM/align.cpp @@ -0,0 +1,102 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// UNSUPPORTED: gpu + +// E2E tests for annotated USM allocation functions with alignment arguments +// that are not powers of 2. Note this test does not work on gpu because some +// tests expect non-templated aligned_alloc_xxx functions to return nullptr, +// e.g. when the alignment argument is not a power of 2, while they fail to do +// so when run on gpu. This maybe because the gpu runtime has different +// behavior. Therefore, GPU is unsupported until issue #12638 gets resolved. + +#include + +#include +#include + +using namespace sycl; +using namespace ext::oneapi::experimental; +using namespace ext::intel::experimental; +using alloc = usm::alloc; + +template void testAlign(sycl::queue &q, unsigned align) { + const sycl::context &Ctx = q.get_context(); + auto dev = q.get_device(); + + constexpr int N = 10; + assert(align > 0 || (align & (align - 1)) == 0); + + auto ADevice = [&](size_t align, auto... args) { + return aligned_alloc_device(align, N, args...); + }; + auto AHost = [&](size_t align, auto... args) { + return aligned_alloc_host(align, N, args...); + }; + auto AShared = [&](size_t align, auto... args) { + return aligned_alloc_shared(align, N, args...); + }; + auto AAnnotated = [&](size_t align, auto... args) { + return aligned_alloc(align, N, args...); + }; + + auto ATDevice = [&](size_t align, auto... args) { + return aligned_alloc_device(align, N, args...); + }; + auto ATHost = [&](size_t align, auto... args) { + return aligned_alloc_host(align, N, args...); + }; + auto ATShared = [&](size_t align, auto... args) { + return aligned_alloc_shared(align, N, args...); + }; + auto ATAnnotated = [&](size_t align, auto... args) { + return aligned_alloc(align, N, args...); + }; + + // Test cases that are expected to return null + auto check_null = [&q](auto AllocFn, int Line, int Case) { + decltype(AllocFn()) Ptr = AllocFn(); + if (Ptr != nullptr) { + free(Ptr, q); + std::cout << "Failed at line " << Line << ", case " << Case << std::endl; + assert(false && "The return is not null!"); + } + }; + + auto CheckNullAll = [&](auto Funcs, int Line = __builtin_LINE()) { + std::apply( + [&](auto... Fs) { + int Case = 0; + (void)std::initializer_list{ + (check_null(Fs, Line, Case++), 0)...}; + }, + Funcs); + }; + + CheckNullAll(std::tuple{ + // Case: aligned_alloc_xxx with no alignment property, and the alignment + // argument is not a power of 2, the result is nullptr + [&]() { return ADevice(3, q); }, [&]() { return ADevice(5, dev, Ctx); }, + [&]() { return AHost(7, q); }, [&]() { return AHost(9, Ctx); }, + [&]() { return AShared(114, q); }, + [&]() { return AShared(1023, dev, Ctx); }, + [&]() { return AAnnotated(15, q, alloc::device); }, + [&]() { return AAnnotated(17, dev, Ctx, alloc::host); } + // Case: aligned_alloc_xxx with no alignment property, and the + // alignment argument is not a power of 2, the result is nullptr + , + [&]() { return ATDevice(3, q); }, [&]() { return ATDevice(5, dev, Ctx); }, + [&]() { return ATHost(7, q); }, [&]() { return ATHost(9, Ctx); }, + [&]() { return ATShared(1919, q); }, + [&]() { return ATShared(11, dev, Ctx); }, + [&]() { return ATAnnotated(15, q, alloc::device); }, + [&]() { return ATAnnotated(17, dev, Ctx, alloc::host); }}); +} + +int main() { + sycl::queue q; + testAlign(q, 4); + testAlign(q, 128); + testAlign>(q, 4); + return 0; +} diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index a25f775edc9e1..ef44dc972797b 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -669,7 +669,7 @@ features.update(sg_size_features) be, dev = sycl_device.split(":") - features.add(dev.replace("acc", "accelerator")) + features.add(dev.replace("fpga", "accelerator")) # Use short names for LIT rules. features.add(be) diff --git a/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp new file mode 100644 index 0000000000000..d241a90568fc3 --- /dev/null +++ b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp @@ -0,0 +1,133 @@ +// RUN: %clangxx -fsycl -fpreview-breaking-changes -fsyntax-only %s -Xclang -verify +// REQUIRES: preview-breaking-changes-supported + +#include + +using namespace sycl; +using namespace sycl::detail; + +namespace builtin_same_shape_v_tests { +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +using swizzle3 = decltype(std::declval>().swizzle<0, 0, 1>()); + +static_assert(builtin_same_shape_v); +static_assert(builtin_same_shape_v); +static_assert(builtin_same_shape_v>); +static_assert(builtin_same_shape_v, marray>); +static_assert(builtin_same_shape_v>); +static_assert(builtin_same_shape_v, vec>); +static_assert(builtin_same_shape_v, swizzle2>); + +static_assert(!builtin_same_shape_v>); +static_assert(!builtin_same_shape_v>); +static_assert(!builtin_same_shape_v, vec>); +static_assert(!builtin_same_shape_v); +static_assert(!builtin_same_shape_v, swizzle1>); +static_assert(!builtin_same_shape_v); +} // namespace builtin_same_shape_v_tests + +namespace builtin_marray_impl_tests { +// Integer functions/relational bitselect only accept fixed-width integer +// element types for vector/swizzle elements. Make sure that our marray->vec +// delegator can handle that. + +auto foo(char x) { return x; } +auto foo(signed char x) { return x; } +auto foo(unsigned char x) { return x; } +auto foo(vec x) { return x; } +auto foo(vec x) { return x; } + +auto test() { + marray x; + marray y; + marray z; + auto TestOne = [](auto x) { + std::ignore = builtin_marray_impl([](auto x) { return foo(x); }, x); + }; + TestOne(x); + TestOne(y); + TestOne(z); +} +} // namespace builtin_marray_impl_tests + +namespace builtin_enable_integer_tests { +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +template void ignore() {} + +void test() { + // clang-format off + ignore, + builtin_enable_integer_t, + builtin_enable_integer_t>(); + // clang-format on + + ignore>, + builtin_enable_integer_t>>(); + + ignore>(); + ignore, vec>>(); + ignore, swizzle2>>(); + ignore>(); + + { + // Only one of char/signed char maps onto int8_t. The other type isn't a + // valid vector element type for integer builtins. + + static_assert(std::is_signed_v); + + // clang-format off + // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable>'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}} + ignore>, builtin_enable_integer_t>>(); + // clang-format on + } + + // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}} + ignore>(); +} +} // namespace builtin_enable_integer_tests + +namespace builtin_enable_bitselect_tests { +// Essentially the same as builtin_enable_integer_t + FP types support. +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +template void ignore() {} + +void test() { + // clang-format off + ignore, + builtin_enable_bitselect_t, + builtin_enable_bitselect_t, + builtin_enable_bitselect_t>(); + // clang-format on + + ignore>, + builtin_enable_bitselect_t>, + builtin_enable_bitselect_t>>(); + + ignore>(); + ignore, vec>>(); + ignore, swizzle2>>(); + ignore>(); + + { + // Only one of char/signed char maps onto int8_t. The other type isn't a + // valid vector element type for integer builtins. + + static_assert(std::is_signed_v); + + // clang-format off + // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable>'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}} + ignore>, builtin_enable_bitselect_t>>(); + // clang-format on + } + + // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}} + ignore>(); +} +} // namespace builtin_enable_bitselect_tests diff --git a/sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp b/sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp similarity index 100% rename from sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp rename to sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp diff --git a/sycl/test/basic_tests/builtins_templates.cpp b/sycl/test/basic_tests/builtins/builtins_templates.cpp similarity index 100% rename from sycl/test/basic_tests/builtins_templates.cpp rename to sycl/test/basic_tests/builtins/builtins_templates.cpp diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/builtins/relational_builtins.cpp similarity index 96% rename from sycl/test/basic_tests/relational_builtins.cpp rename to sycl/test/basic_tests/builtins/relational_builtins.cpp index ab9f3aa5c4326..eabd7e26c35cb 100644 --- a/sycl/test/basic_tests/relational_builtins.cpp +++ b/sycl/test/basic_tests/builtins/relational_builtins.cpp @@ -4,7 +4,7 @@ // NOTE: Compile the test fully to ensure the library exports the right host // symbols. -#include +#include // Some helper macros to verify return type of the builtins. To be used like // this @@ -19,15 +19,15 @@ template struct CheckHelper { template static auto call(F f) { return f(Args()...); } }; -#define CHECK(EXPECTED, FUNC, ...) \ +#define CHECK(EXPECTED, FUNC, ...) \ { \ auto ret = CheckHelper<__VA_ARGS__>::call( \ - [](auto... args) { return cl::sycl::FUNC(args...); }); \ - static_assert(std::is_same_v); \ + [](auto... args) { return sycl::FUNC(args...); }); \ + static_assert(std::is_same_v); \ } void foo() { - using namespace cl::sycl; + using namespace sycl; using boolm = marray; using int16v = vec; @@ -247,9 +247,9 @@ void foo() { } int main() { - cl::sycl::queue q; + sycl::queue q; foo(); // Verify host. - q.submit([&](cl::sycl::handler &cgh) { + q.submit([&](sycl::handler &cgh) { cgh.single_task([]() { foo(); // verify device }); diff --git a/sycl/test/basic_tests/types.cpp b/sycl/test/basic_tests/types.cpp index 6aab1e433c7a7..14a1070567274 100644 --- a/sycl/test/basic_tests/types.cpp +++ b/sycl/test/basic_tests/types.cpp @@ -134,12 +134,13 @@ template inline void checkVecNotReturnType() { using Vector = sycl::vec; #if defined(__INTEL_PREVIEW_BREAKING_CHANGES) using ExpectedVector = sycl::vec; + using OpNotResult = decltype(operator!(std::declval())); #else using ExpectedVector = sycl::vec; -#endif using OpNotResult = decltype(std::declval().operator!()); +#endif static_assert(std::is_same_v, - "Incorrect vec::operator! return type"); + "Incorrect operator! return type"); } // the math built-in testing ensures that the vec binary ops get tested, diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 3f653adb5e476..8305bd9b83b18 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, acc_res = gather(local_acc, ioffset_n32, 0); acc_res = gather(local_acc, ioffset_n32, 0, mask_n32); - // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) scatter(ptrf, ioffset_n32, usm, mask_n32); scatter(ptrf, ioffset_n32, usm); @@ -1281,6 +1281,14 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); scatter(ptrf, ioffset_n16_view, usm_view); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + scatter(ptrf, ioffset_n10, usm_n10); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} @@ -1302,6 +1310,7 @@ test_slm_gather_scatter(int byte_offset32) { simd slm; simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto slm_view = slm.select<32, 1>(); // Test SLM gather using this plan: // 1) slm_gather(offsets): offsets is simd or simd_view @@ -1373,4 +1382,66 @@ test_slm_gather_scatter(int byte_offset32) { props_align4); slm = slm_gather(ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // Test SLM scatter using this plan: + // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view + // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view + // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1. + + // 1) slm_scatter(offsets): offsets is simd or simd_view + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + slm_scatter(ioffset_n32, slm); + slm_scatter(ioffset_n32_view, slm); + slm_scatter(ioffset_n32, slm_view); + slm_scatter(ioffset_n32_view, slm_view); + + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, props_align8); + slm_scatter(ioffset_n32_view, slm, props_align8); + slm_scatter(ioffset_n32, slm_view, props_align8); + slm_scatter(ioffset_n32_view, slm_view, props_align8); + + // 2) slm_gather(offsets, mask): offsets is simd or simd_view + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32); + slm_scatter(ioffset_n32_view, slm, mask_n32); + slm_scatter(ioffset_n32, slm_view, mask_n32); + slm_scatter(ioffset_n32_view, slm_view, mask_n32); + + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm_view, mask_n32, props_align8); + + // 4) slm_gather(...): same as (1), (2), above, but with VS > 1. + // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + // 4a) check VS > 1. no 'mask' operand first. + slm_scatter(ioffset_n16, slm); + slm_scatter(ioffset_n16_view, slm); + slm_scatter(ioffset_n16, slm_view); + slm_scatter(ioffset_n16_view, slm_view); + + slm_scatter(ioffset_n16, slm, props_align4); + slm_scatter(ioffset_n16_view, slm, props_align4); + slm_scatter(ioffset_n16, slm_view, props_align4); + slm_scatter(ioffset_n16_view, slm_view, props_align4); + + // 4b) check VS > 1. Pass the 'mask' operand this time. + slm_scatter(ioffset_n16, slm, mask_n16); + slm_scatter(ioffset_n16_view, slm, mask_n16); + slm_scatter(ioffset_n16, slm_view, mask_n16); + slm_scatter(ioffset_n16_view, slm_view, mask_n16); + + slm_scatter(ioffset_n16, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16, slm_view, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + slm_scatter(ioffset_n10, usm_n10); } diff --git a/sycl/unittests/allowlist/ParseAllowList.cpp b/sycl/unittests/allowlist/ParseAllowList.cpp index faecab30aaeaf..40fbceb76616e 100644 --- a/sycl/unittests/allowlist/ParseAllowList.cpp +++ b/sycl/unittests/allowlist/ParseAllowList.cpp @@ -178,7 +178,8 @@ TEST(ParseAllowListTests, CheckAllValidBackendNameValuesAreProcessed) { TEST(ParseAllowListTests, CheckAllValidDeviceTypeValuesAreProcessed) { std::string AllowList; - for (const auto &SyclDeviceType : sycl::detail::getSyclDeviceTypeMap()) { + for (const auto &SyclDeviceType : + sycl::detail::getSyclDeviceTypeMap(true /*Enable 'acc'*/)) { if (!AllowList.empty()) AllowList += "|"; AllowList += "DeviceType:" + SyclDeviceType.first;