From 3ebffba28433d3f3e548f83d5d0b0083ce2b6ffc Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 12 Feb 2024 10:37:59 -0800 Subject: [PATCH 1/6] [NFCI][SYCL] Introduce convertToOpenCLType helper (#12674) This is probably just the first patch in a series of refactoring PRs I'm envisioning. Ultimate goal is to align this helper and `sycl::detail::builtins::convert_arg` from `builtins_preview.hpp`, possibly with the latter delegating to the former. --- sycl/include/sycl/detail/builtins.hpp | 60 ++++++------- .../sycl/detail/generic_type_traits.hpp | 7 ++ sycl/include/sycl/detail/image_ocl_types.hpp | 52 +++++------- sycl/include/sycl/detail/spirv.hpp | 84 ++++++++----------- .../sycl/ext/oneapi/experimental/builtins.hpp | 8 +- 5 files changed, 98 insertions(+), 113 deletions(-) diff --git a/sycl/include/sycl/detail/builtins.hpp b/sycl/include/sycl/detail/builtins.hpp index 69bde7c7b0d93..d0d76cc1c6925 100644 --- a/sycl/include/sycl/detail/builtins.hpp +++ b/sycl/include/sycl/detail/builtins.hpp @@ -18,20 +18,27 @@ #ifdef __SYCL_DEVICE_ONLY__ #define __FUNC_PREFIX_OCL __spirv_ocl_ #define __FUNC_PREFIX_CORE __spirv_ -#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1) -#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2) -#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) +#define __SYCL_EXTERN_IT1(Ret, prefix, call, arg1) +#define __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2) +#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg) #define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3) #else #define __FUNC_PREFIX_OCL sycl_host_ #define __FUNC_PREFIX_CORE sycl_host_ -#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg) \ +#define __SYCL_EXTERN_IT1(Ret, prefix, call, arg) \ + using Arg = decltype(arg); \ extern Ret __SYCL_PPCAT(prefix, call)(Arg) -#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) \ +#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg) \ + using Arg = decltype(arg); \ extern Ret __SYCL_PPCAT(prefix, call)(Arg, Arg) -#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2) \ +#define __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2) \ + using Arg1 = decltype(arg1); \ + using Arg2 = decltype(arg2); \ extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2) -#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3) \ +#define __SYCL_EXTERN_IT3(Ret, prefix, call, arg1, arg2, arg3) \ + using Arg1 = decltype(arg1); \ + using Arg2 = decltype(arg2); \ + using Arg3 = decltype(arg3); \ extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2, Arg3) #endif @@ -41,10 +48,9 @@ #define __SYCL_MAKE_CALL_ARG1(call, prefix) \ template \ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1) __NOEXC { \ + auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ using Ret = sycl::detail::ConvertToOpenCLType_t; \ - using Arg1 = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1); \ - Arg1 arg1 = sycl::detail::convertDataToType(std::move(t1)); \ + __SYCL_EXTERN_IT1(Ret, prefix, call, arg1); \ Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1)); \ return sycl::detail::convertDataToType(std::move(ret)); \ } @@ -52,12 +58,10 @@ #define __SYCL_MAKE_CALL_ARG2(call, prefix) \ template \ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1, T2 t2) __NOEXC { \ + auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ + auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ using Ret = sycl::detail::ConvertToOpenCLType_t; \ - using Arg1 = sycl::detail::ConvertToOpenCLType_t; \ - using Arg2 = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2); \ - Arg1 arg1 = sycl::detail::convertDataToType(std::move(t1)); \ - Arg2 arg2 = sycl::detail::convertDataToType(std::move(t2)); \ + __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2); \ Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ return sycl::detail::convertDataToType(std::move(ret)); \ } @@ -65,11 +69,10 @@ #define __SYCL_MAKE_CALL_ARG2_SAME(call, prefix) \ template \ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T t1, T t2) __NOEXC { \ + auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ + auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ using Ret = sycl::detail::ConvertToOpenCLType_t; \ - using Arg = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg); \ - Arg arg1 = sycl::detail::convertDataToType(std::move(t1)); \ - Arg arg2 = sycl::detail::convertDataToType(std::move(t2)); \ + __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg1); \ Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ return sycl::detail::convertDataToType(std::move(ret)); \ } @@ -77,10 +80,10 @@ #define __SYCL_MAKE_CALL_ARG2_SAME_RESULT(call, prefix) \ template \ inline __SYCL_ALWAYS_INLINE T __invoke_##call(T v1, T v2) __NOEXC { \ - using Type = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT2_SAME(Type, prefix, call, Type); \ - Type arg1 = sycl::detail::convertDataToType(std::move(v1)); \ - Type arg2 = sycl::detail::convertDataToType(std::move(v2)); \ + auto arg1 = sycl::detail::convertToOpenCLType(std::move(v1)); \ + auto arg2 = sycl::detail::convertToOpenCLType(std::move(v2)); \ + using Type = decltype(arg1); \ + __SYCL_EXTERN_IT2_SAME(Type, prefix, call, arg1); \ Type ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ return sycl::detail::convertDataToType(std::move(ret)); \ } @@ -88,14 +91,11 @@ #define __SYCL_MAKE_CALL_ARG3(call, prefix) \ template \ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1, T2 t2, T3 t3) __NOEXC { \ + auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ + auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ + auto arg3 = sycl::detail::convertToOpenCLType(std::move(t3)); \ using Ret = sycl::detail::ConvertToOpenCLType_t; \ - using Arg1 = sycl::detail::ConvertToOpenCLType_t; \ - using Arg2 = sycl::detail::ConvertToOpenCLType_t; \ - using Arg3 = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3); \ - Arg1 arg1 = sycl::detail::convertDataToType(std::move(t1)); \ - Arg2 arg2 = sycl::detail::convertDataToType(std::move(t2)); \ - Arg3 arg3 = sycl::detail::convertDataToType(std::move(t3)); \ + __SYCL_EXTERN_IT3(Ret, prefix, call, arg1, arg2, arg3); \ Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2), \ std::move(arg3)); \ return sycl::detail::convertDataToType(std::move(ret)); \ diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 36431fcdad13a..ea36fcf7cd669 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -697,6 +697,13 @@ convertDataToType(FROM t) { return ConvertNonVectorType(t); } +// Now fuse the above into a simpler helper that's easy to use. +// TODO: That should probably be moved outside of "type_traits". +template auto convertToOpenCLType(T &&x) { + using OpenCLType = ConvertToOpenCLType_t>; + return convertDataToType(std::forward(x)); +} + // Used for all, any and select relational built-in functions template inline constexpr T msbMask(T) { using UT = make_unsigned_t; diff --git a/sycl/include/sycl/detail/image_ocl_types.hpp b/sycl/include/sycl/detail/image_ocl_types.hpp index 60db7ccb645c8..57f17a33bd13f 100644 --- a/sycl/include/sycl/detail/image_ocl_types.hpp +++ b/sycl/include/sycl/detail/image_ocl_types.hpp @@ -62,13 +62,11 @@ template static void __invoke__ImageWrite(ImageT Img, CoordT Coords, ValT Val) { // Convert from sycl types to builtin types to get correct function mangling. - using TmpValT = sycl::detail::ConvertToOpenCLType_t; - using TmpCoordT = sycl::detail::ConvertToOpenCLType_t; + auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); + auto TmpVal = sycl::detail::convertToOpenCLType(Val); - TmpCoordT TmpCoord = - sycl::detail::convertDataToType(Coords); - TmpValT TmpVal = sycl::detail::convertDataToType(Val); - __spirv_ImageWrite(Img, TmpCoord, TmpVal); + __spirv_ImageWrite( + Img, TmpCoords, TmpVal); } template @@ -76,10 +74,11 @@ static RetType __invoke__ImageRead(ImageT Img, CoordT Coords) { // Convert from sycl types to builtin types to get correct function mangling. using TempRetT = sycl::detail::ConvertToOpenCLType_t; - using TempArgT = sycl::detail::ConvertToOpenCLType_t; - TempArgT Arg = sycl::detail::convertDataToType(Coords); - TempRetT Ret = __spirv_ImageRead(Img, Arg); + auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); + + TempRetT Ret = + __spirv_ImageRead(Img, TmpCoords); return sycl::detail::convertDataToType(Ret); } @@ -89,10 +88,7 @@ static RetType __invoke__ImageReadLod(SmpImageT SmpImg, CoordT Coords, // Convert from sycl types to builtin types to get correct function mangling. using TempRetT = sycl::detail::ConvertToOpenCLType_t; - using TempArgT = sycl::detail::ConvertToOpenCLType_t; - - TempArgT TmpCoords = - sycl::detail::convertDataToType(Coords); + auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); enum ImageOperands { Lod = 0x2 }; @@ -102,8 +98,9 @@ static RetType __invoke__ImageReadLod(SmpImageT SmpImg, CoordT Coords, // Sampled Image must be an object whose type is OpTypeSampledImage // Image Operands encodes what operands follow. Either Lod // or Grad image operands must be present - TempRetT Ret = __spirv_ImageSampleExplicitLod( - SmpImg, TmpCoords, ImageOperands::Lod, Level); + TempRetT Ret = + __spirv_ImageSampleExplicitLod( + SmpImg, TmpCoords, ImageOperands::Lod, Level); return sycl::detail::convertDataToType(Ret); } @@ -113,12 +110,9 @@ static RetType __invoke__ImageReadGrad(SmpImageT SmpImg, CoordT Coords, // Convert from sycl types to builtin types to get correct function mangling. using TempRetT = sycl::detail::ConvertToOpenCLType_t; - using TempArgT = sycl::detail::ConvertToOpenCLType_t; - - TempArgT TmpCoords = - sycl::detail::convertDataToType(Coords); - TempArgT TmpGraddX = sycl::detail::convertDataToType(Dx); - TempArgT TmpGraddY = sycl::detail::convertDataToType(Dy); + auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); + auto TmpGraddX = sycl::detail::convertToOpenCLType(Dx); + auto TmpGraddY = sycl::detail::convertToOpenCLType(Dy); enum ImageOperands { Grad = 0x3 }; @@ -128,8 +122,9 @@ static RetType __invoke__ImageReadGrad(SmpImageT SmpImg, CoordT Coords, // Sampled Image must be an object whose type is OpTypeSampledImage // Image Operands encodes what operands follow. Either Lod // or Grad image operands must be present - TempRetT Ret = __spirv_ImageSampleExplicitLod( - SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY); + TempRetT Ret = + __spirv_ImageSampleExplicitLod( + SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY); return sycl::detail::convertDataToType(Ret); } @@ -139,12 +134,10 @@ static RetType __invoke__ImageReadSampler(ImageT Img, CoordT Coords, // Convert from sycl types to builtin types to get correct function mangling. using TempRetT = sycl::detail::ConvertToOpenCLType_t; - using TempArgT = sycl::detail::ConvertToOpenCLType_t; using SampledT = typename sycl::detail::sampled_opencl_image_type::type; - TempArgT TmpCoords = - sycl::detail::convertDataToType(Coords); + auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); // According to validation rules(SPIR-V specification, section 2.16.1) result // of __spirv_SampledImage is allowed to be an operand of image lookup // and query instructions explicitly specified to take an operand whose @@ -157,9 +150,10 @@ static RetType __invoke__ImageReadSampler(ImageT Img, CoordT Coords, enum ImageOperands { Lod = 0x2 }; // Lod value is zero as mipmap is not supported. - TempRetT Ret = __spirv_ImageSampleExplicitLod( - __spirv_SampledImage(Img, Smpl), TmpCoords, - ImageOperands::Lod, 0.0f); + TempRetT Ret = + __spirv_ImageSampleExplicitLod( + __spirv_SampledImage(Img, Smpl), TmpCoords, + ImageOperands::Lod, 0.0f); return sycl::detail::convertDataToType(Ret); } diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index ee80a50db417b..5a1215dbe2f8a 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -256,14 +256,11 @@ template <> struct GroupId<::sycl::sub_group> { }; template EnableIfNativeBroadcast GroupBroadcast(Group, T x, IdT local_id) { - using GroupIdT = typename GroupId::type; - GroupIdT GroupLocalId = static_cast(local_id); - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(GroupLocalId); - return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); + auto GroupLocalId = static_cast::type>(local_id); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(GroupLocalId); + return __spirv_GroupBroadcast(group_scope::value, WideOCLX, OCLId); } template EnableIfNativeBroadcast @@ -273,23 +270,20 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group g, auto LocalId = detail::IdToMaskPosition(g, local_id); // TODO: Refactor to avoid duplication after design settles. - using GroupIdT = typename GroupId::type; - GroupIdT GroupLocalId = static_cast(LocalId); - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + auto GroupLocalId = static_cast::type>(LocalId); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(GroupLocalId); // ballot_group partitions its parent into two groups (0 and 1) // We have to force each group down different control flow // Work-items in the "false" group (0) may still be active if (g.get_group_id() == 1) { return __spirv_GroupNonUniformBroadcast(group_scope::value, - OCLX, OCLId); + WideOCLX, OCLId); } else { return __spirv_GroupNonUniformBroadcast(group_scope::value, - OCLX, OCLId); + WideOCLX, OCLId); } } template @@ -300,20 +294,17 @@ EnableIfNativeBroadcast GroupBroadcast( auto LocalId = g.get_group_linear_id() * PartitionSize + local_id; // TODO: Refactor to avoid duplication after design settles. - using GroupIdT = typename GroupId::type; - GroupIdT GroupLocalId = static_cast(LocalId); - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + auto GroupLocalId = static_cast::type>(LocalId); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(GroupLocalId); // NonUniformBroadcast requires Id to be dynamically uniform, which does not // hold here; each partition is broadcasting a separate index. We could // fallback to either NonUniformShuffle or a NonUniformBroadcast per // partition, and it's unclear which will be faster in practice. - return __spirv_GroupNonUniformShuffle(group_scope::value, OCLX, - OCLId); + return __spirv_GroupNonUniformShuffle(group_scope::value, + WideOCLX, OCLId); } template EnableIfNativeBroadcast @@ -323,16 +314,13 @@ GroupBroadcast(ext::oneapi::experimental::tangle_group g, T x, auto LocalId = detail::IdToMaskPosition(g, local_id); // TODO: Refactor to avoid duplication after design settles. - using GroupIdT = typename GroupId::type; - GroupIdT GroupLocalId = static_cast(LocalId); - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + auto GroupLocalId = static_cast::type>(LocalId); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(GroupLocalId); - return __spirv_GroupNonUniformBroadcast(group_scope::value, OCLX, - OCLId); + return __spirv_GroupNonUniformBroadcast(group_scope::value, + WideOCLX, OCLId); } template EnableIfNativeBroadcast @@ -342,17 +330,15 @@ GroupBroadcast(const ext::oneapi::experimental::opportunistic_group &g, T x, auto LocalId = detail::IdToMaskPosition(g, local_id); // TODO: Refactor to avoid duplication after design settles. - using GroupIdT = typename GroupId<::sycl::sub_group>::type; - GroupIdT GroupLocalId = static_cast(LocalId); - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(GroupLocalId); + auto GroupLocalId = + static_cast::type>(LocalId); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(GroupLocalId); return __spirv_GroupNonUniformBroadcast( - group_scope::value, OCLX, - OCLId); + group_scope::value, + WideOCLX, OCLId); } template @@ -386,16 +372,14 @@ EnableIfNativeBroadcast GroupBroadcast(Group g, T x, return GroupBroadcast(g, x, local_id[0]); } using IdT = vec; - using OCLT = detail::ConvertToOpenCLType_t; - using WidenedT = WidenOpenCLTypeTo32_t; - using OCLIdT = detail::ConvertToOpenCLType_t; IdT VecId; for (int i = 0; i < Dimensions; ++i) { VecId[i] = local_id[Dimensions - i - 1]; } - WidenedT OCLX = detail::convertDataToType(x); - OCLIdT OCLId = detail::convertDataToType(VecId); - return __spirv_GroupBroadcast(group_scope::value, OCLX, OCLId); + auto OCLX = detail::convertToOpenCLType(x); + WidenOpenCLTypeTo32_t WideOCLX = OCLX; + auto OCLId = detail::convertToOpenCLType(VecId); + return __spirv_GroupBroadcast(group_scope::value, WideOCLX, OCLId); } template EnableIfNativeBroadcast diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index 14b36a29ccd73..3054e661b4a32 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -96,8 +96,8 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< tanh(T x) __NOEXC { #if defined(__NVPTX__) using _ocl_T = sycl::detail::ConvertToOpenCLType_t; - _ocl_T arg1 = sycl::detail::convertDataToType(x); - return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_tanh(arg1)); + return sycl::detail::convertDataToType<_ocl_T, T>( + __clc_native_tanh(sycl::detail::convertToOpenCLType(x))); #else return __sycl_std::__invoke_tanh(x); #endif @@ -146,8 +146,8 @@ inline __SYCL_ALWAYS_INLINE exp2(T x) __NOEXC { #if defined(__NVPTX__) using _ocl_T = sycl::detail::ConvertToOpenCLType_t; - _ocl_T arg1 = sycl::detail::convertDataToType(x); - return sycl::detail::convertDataToType<_ocl_T, T>(__clc_native_exp2(arg1)); + return sycl::detail::convertDataToType<_ocl_T, T>( + __clc_native_exp2(sycl::detail::convertToOpenCLType(x))); #else return __sycl_std::__invoke_exp2(x); #endif From 6639e789f93354aa32f28d0ba8642186bfa4fd79 Mon Sep 17 00:00:00 2001 From: Lorenc Bushi <113361374+lbushi25@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:31:51 -0500 Subject: [PATCH 2/6] [SYCL] Rewrite tests that fail when usm_shared_allocations not supported #2 (#12655) Continuation of https://github.com/intel/llvm/pull/12636. Refer to it for a description. --- .../discard_events_mixed_calls.cpp | 2 +- .../DiscardEvents/discard_events_usm.cpp | 2 +- .../discard_events_usm_ooo_queue.cpp | 2 +- sycl/test-e2e/DiscardEvents/invalid_event.cpp | 5 +- .../exclusive_scan_over_group.cpp | 55 ++++++----- sycl/test-e2e/GroupAlgorithm/root_group.cpp | 97 ++++++++++--------- .../InOrderEventsExt/get_last_event.cpp | 3 +- .../InOrderEventsExt/set_external_event.cpp | 23 +++-- .../KernelAndProgram/disable-caching.cpp | 2 +- .../sync_two_queues_event_dep.cpp | 34 ++++--- .../test-e2e/KernelFusion/sync_usm_mem_op.cpp | 34 ++++--- .../test-e2e/Reduction/reduction_internal.cpp | 18 ++-- .../Reduction/reduction_range_item.cpp | 30 +++--- sycl/test-e2e/Reduction/reduction_span.cpp | 9 +- .../Reduction/reduction_span_pack.cpp | 33 ++++--- .../Regression/exclusive-scan-char-short.cpp | 2 +- .../Regression/group_local_linear_id.cpp | 20 ++-- sycl/test-e2e/Regression/half_operators.cpp | 46 ++++----- sycl/test-e2e/Regression/pf-wg-atomic64.cpp | 5 +- .../Regression/range-rounding-this-id.cpp | 18 ++-- .../Regression/reduction_64bit_atomic64.cpp | 3 +- 21 files changed, 237 insertions(+), 206 deletions(-) diff --git a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp index 8206ea5b449c3..8399951d16298 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp @@ -24,7 +24,7 @@ // the tests, please check if they pass without the discard_events property, if // they don't pass then it's most likely a general issue unrelated to // discard_events. - +// REQUIRES: aspect-usm_shared_allocations #include #include #include diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp index 11288d6620bfd..48ab65c68896c 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_usm.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt - +// REQUIRES: aspect-usm_shared_allocations // The test checks that the last parameter is `nullptr` for all PI calls that // should discard events. // {{0|0000000000000000}} is required for various output on Linux and Windows. diff --git a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp index cfe72db0c1232..96d53a632beb6 100644 --- a/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp +++ b/sycl/test-e2e/DiscardEvents/discard_events_usm_ooo_queue.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt - +// REQUIRES: aspect-usm_shared_allocations // The test checks that the last parameter is not `nullptr` for all PI calls // that should discard events. // {{0|0000000000000000}} is required for various output on Linux and Windows. diff --git a/sycl/test-e2e/DiscardEvents/invalid_event.cpp b/sycl/test-e2e/DiscardEvents/invalid_event.cpp index 273e74afb6c25..17c6a492a10c2 100644 --- a/sycl/test-e2e/DiscardEvents/invalid_event.cpp +++ b/sycl/test-e2e/DiscardEvents/invalid_event.cpp @@ -2,7 +2,6 @@ // https://github.com/intel/llvm/issues/7330. // UNSUPPORTED: opencl && gpu // RUN: %{build} -o %t.out - // RUN: %{run} %t.out // The test checks that each PI call to the queue returns a discarded event @@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) { sycl::range<1> range(BUFFER_SIZE); auto Dev = Q.get_device(); - int *x = sycl::malloc_shared(BUFFER_SIZE, Q); + int *x = sycl::malloc_device(BUFFER_SIZE, Q); assert(x != nullptr); - int *y = sycl::malloc_shared(BUFFER_SIZE, Q); + int *y = sycl::malloc_device(BUFFER_SIZE, Q); assert(y != nullptr); sycl::event DiscardedEvent; diff --git a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp index 6411131ef33ff..8f7ee3a55352c 100644 --- a/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp @@ -9,11 +9,14 @@ #include #include -template -sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data, - T *out_data, int num_wg, int group_size) { +template +sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data, + AccessorT &out_data, int num_wg, + int group_size) { sycl::nd_range<1> thread_range(num_wg * group_size, group_size); sycl::event event = queue->submit([&](sycl::handler &cgh) { + cgh.require(in_data); + cgh.require(out_data); cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) { auto id = item.get_global_linear_id(); auto group = item.get_group(); @@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data, return event; } -template -void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data, - int num_wg, int group_size) { - compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size); +template +void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data, + AccessorT &out_data, int num_wg, int group_size) { + compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size); } int main(int argc, const char **argv) { - int num_wg = 1; - int group_size = 16; + constexpr int num_wg = 1; + constexpr int group_size = 16; sycl::queue queue; - - typedef int T; - size_t nelems = num_wg * group_size; - T *data = sycl::malloc_shared(nelems, queue); - T *result = sycl::malloc_shared(nelems, queue); - queue.fill(data, T(2), nelems).wait(); - queue.memset(result, 0, nelems * sizeof(T)).wait(); - - test_compiler_group_scan(&queue, data, result, num_wg, group_size); - queue.wait(); - T expected[] = {1, 2, 4, 8, 16, 32, 64, 128, - 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}; - for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) { - assert(result[i] == expected[i]); + constexpr size_t nelems = num_wg * group_size; + int data[nelems]; + int result[nelems]; + for (size_t i = 0; i < nelems; ++i) { + data[i] = 2; + result[i] = 0; + } + sycl::buffer data_buf{&data[0], sycl::range{nelems}}; + sycl::buffer result_buf{&result[0], sycl::range{nelems}}; + sycl::accessor data_acc{data_buf}; + sycl::accessor result_acc{result_buf}; + test_compiler_group_scan(&queue, data_acc, result_acc, num_wg, + group_size); + sycl::host_accessor result_host{result_buf}; + int expected[] = {1, 2, 4, 8, 16, 32, 64, 128, + 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}; + for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) { + assert(result_host[i] == expected[i]); } - sycl::free(data, queue); - sycl::free(result, queue); return 0; } diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp index ba0c49fa68bf7..e346ea142b759 100644 --- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp +++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp @@ -41,27 +41,29 @@ void testRootGroup() { max_num_work_group_sync>(q); const auto props = sycl::ext::oneapi::experimental::properties{ sycl::ext::oneapi::experimental::use_root_sync}; - - int *data = sycl::malloc_shared(maxWGs * WorkGroupSize, q); + sycl::buffer dataBuf{sycl::range{maxWGs * WorkGroupSize}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; - q.parallel_for(range, props, [=](sycl::nd_item<1> it) { - auto root = it.ext_oneapi_get_root_group(); - data[root.get_local_id()] = root.get_local_id(); - sycl::group_barrier(root); + q.submit([&](sycl::handler &h) { + sycl::accessor data{dataBuf, h}; + h.parallel_for( + range, props, [=](sycl::nd_item<1> it) { + auto root = it.ext_oneapi_get_root_group(); + data[root.get_local_id()] = root.get_local_id(); + sycl::group_barrier(root); - root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>(); - int sum = data[root.get_local_id()] + - data[root.get_local_range() - root.get_local_id() - 1]; - sycl::group_barrier(root); - data[root.get_local_id()] = sum; + root = + sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>(); + int sum = data[root.get_local_id()] + + data[root.get_local_range() - root.get_local_id() - 1]; + sycl::group_barrier(root); + data[root.get_local_id()] = sum; + }); }); - q.wait(); - + sycl::host_accessor data{dataBuf}; const int workItemCount = static_cast(range.get_global_range().size()); for (int i = 0; i < workItemCount; i++) { assert(data[i] == (workItemCount - 1)); } - sycl::free(data, q); } void testRootGroupFunctions() { @@ -76,44 +78,45 @@ void testRootGroupFunctions() { sycl::ext::oneapi::experimental::use_root_sync}; constexpr int testCount = 10; - bool *testResults = sycl::malloc_shared(testCount, q); + sycl::buffer testResultsBuf{sycl::range{testCount}}; const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize}; - q.parallel_for( - range, props, [=](sycl::nd_item<1> it) { - const auto root = it.ext_oneapi_get_root_group(); - if (root.leader() || root.get_local_id() == 3) { - testResults[0] = root.get_group_id() == sycl::id<1>(0); - testResults[1] = root.leader() - ? root.get_local_id() == sycl::id<1>(0) - : root.get_local_id() == sycl::id<1>(3); - testResults[2] = root.get_group_range() == sycl::range<1>(1); - testResults[3] = - root.get_local_range() == sycl::range<1>(WorkGroupSize); - testResults[4] = - root.get_max_local_range() == sycl::range<1>(WorkGroupSize); - testResults[5] = root.get_group_linear_id() == 0; - testResults[6] = - root.get_local_linear_id() == root.get_local_id().get(0); - testResults[7] = root.get_group_linear_range() == 1; - testResults[8] = root.get_local_linear_range() == WorkGroupSize; - - const auto child = - sycl::ext::oneapi::experimental::get_child_group(root); - const auto grandchild = - sycl::ext::oneapi::experimental::get_child_group(child); - testResults[9] = child == it.get_group(); - static_assert( - std::is_same_v::type, - sycl::sub_group>, - "get_child_group(sycl::group) must return a sycl::sub_group"); - } - }); - q.wait(); + q.submit([&](sycl::handler &h) { + sycl::accessor testResults{testResultsBuf, h}; + h.parallel_for( + range, props, [=](sycl::nd_item<1> it) { + const auto root = it.ext_oneapi_get_root_group(); + if (root.leader() || root.get_local_id() == 3) { + testResults[0] = root.get_group_id() == sycl::id<1>(0); + testResults[1] = root.leader() + ? root.get_local_id() == sycl::id<1>(0) + : root.get_local_id() == sycl::id<1>(3); + testResults[2] = root.get_group_range() == sycl::range<1>(1); + testResults[3] = + root.get_local_range() == sycl::range<1>(WorkGroupSize); + testResults[4] = + root.get_max_local_range() == sycl::range<1>(WorkGroupSize); + testResults[5] = root.get_group_linear_id() == 0; + testResults[6] = + root.get_local_linear_id() == root.get_local_id().get(0); + testResults[7] = root.get_group_linear_range() == 1; + testResults[8] = root.get_local_linear_range() == WorkGroupSize; + const auto child = + sycl::ext::oneapi::experimental::get_child_group(root); + const auto grandchild = + sycl::ext::oneapi::experimental::get_child_group(child); + testResults[9] = child == it.get_group(); + static_assert( + std::is_same_v::type, + sycl::sub_group>, + "get_child_group(sycl::group) must return a sycl::sub_group"); + } + }); + }); + sycl::host_accessor testResults{testResultsBuf}; for (int i = 0; i < testCount; i++) { assert(testResults[i]); } - sycl::free(testResults, q); } int main() { diff --git a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp index 3393202b5a370..aeb4ab44acb49 100644 --- a/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp +++ b/sycl/test-e2e/InOrderEventsExt/get_last_event.cpp @@ -34,7 +34,8 @@ int main() { Failed += Check(Q, "host_task", [&]() { return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); }); }); - + if (!Q.get_device().has(sycl::aspect::usm_shared_allocations)) + return Failed; constexpr size_t N = 64; int *Data1 = sycl::malloc_shared(N, Q); int *Data2 = sycl::malloc_shared(N, Q); diff --git a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp index 45e5815606dbe..61987d5b5b3cb 100644 --- a/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp +++ b/sycl/test-e2e/InOrderEventsExt/set_external_event.cpp @@ -15,20 +15,26 @@ int main() { sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}}; sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}}; - int *DevData = sycl::malloc_shared(N, Dev, Ctx); + sycl::buffer DevDataBuf{sycl::range{N}}; + sycl::accessor DevData{DevDataBuf}; int *HostData = (int *)malloc(N * sizeof(int) * 10); for (size_t I = 0; I < 10; ++I) { - Q1.fill(DevData, 0, N); - sycl::event E1 = Q1.parallel_for( - N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; }); + Q1.fill(DevData, 0); + sycl::event E1 = Q1.submit([&](sycl::handler &h) { + h.require(DevData); + h.parallel_for( + N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; }); + }); Q2.ext_oneapi_set_external_event(E1); - sycl::event E2 = - Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; }); + sycl::event E2 = Q2.submit([&](sycl::handler &h) { + h.require(DevData); + h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; }); + }); Q1.ext_oneapi_set_external_event(E2); - Q1.copy(DevData, HostData + N * I, N); + Q1.copy(DevData, HostData + N * I); } Q1.wait_and_throw(); @@ -46,9 +52,6 @@ int main() { } } } - - sycl::free(DevData, Ctx); free(HostData); - return Failures; } diff --git a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp index 4276efe36366f..c50e23b1c17c4 100644 --- a/sycl/test-e2e/KernelAndProgram/disable-caching.cpp +++ b/sycl/test-e2e/KernelAndProgram/disable-caching.cpp @@ -74,7 +74,7 @@ int main() { // CHECK-CACHE: piKernelRelease // CHECK-CACHE: piProgramRelease // CHECK-CACHE: piEventsWait - auto *p = malloc_shared(1, q); + auto *p = malloc_device(1, q); for (int i = 0; i < 2; ++i) q.submit([&](handler &cgh) { cgh.set_specialization_constant(i); diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 38286c48b8a0f..79347114ec2a4 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -14,19 +14,21 @@ int main() { queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}}; queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}}; - int *in1 = sycl::malloc_shared(dataSize, q1); - int *in2 = sycl::malloc_shared(dataSize, q1); - int *in3 = sycl::malloc_shared(dataSize, q1); - int *tmp = sycl::malloc_shared(dataSize, q1); - int *out = sycl::malloc_shared(dataSize, q1); - - for (size_t i = 0; i < dataSize; ++i) { - in1[i] = i * 2; - in2[i] = i * 3; - in3[i] = i * 4; - tmp[i] = -1; - out[i] = -1; - } + int *in1 = sycl::malloc_device(dataSize, q1); + int *in2 = sycl::malloc_device(dataSize, q1); + int *in3 = sycl::malloc_device(dataSize, q1); + int *tmp = sycl::malloc_device(dataSize, q1); + int *out = sycl::malloc_device(dataSize, q1); + + q1.single_task([=]() { + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + }).wait(); ext::codeplay::experimental::fusion_wrapper fw1{q1}; fw1.start_fusion(); @@ -71,10 +73,12 @@ int main() { q1.wait(); q2.wait(); - + int host_out[dataSize]; + q1.memcpy(host_out, out, dataSize * sizeof(int)); + q1.wait(); // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (40 * i * i) && "Computation error"); + assert(host_out[i] == (40 * i * i) && "Computation error"); } sycl::free(in1, q1); sycl::free(in2, q1); diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp index 0e6bc288812fd..f3d2cc97b9ae5 100644 --- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp +++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp @@ -1,6 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s - // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows @@ -16,21 +15,24 @@ int main() { queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; - int *in1 = sycl::malloc_shared(dataSize, q); - int *in2 = sycl::malloc_shared(dataSize, q); - int *in3 = sycl::malloc_shared(dataSize, q); - int *tmp = sycl::malloc_shared(dataSize, q); - int *out = sycl::malloc_shared(dataSize, q); + int *in1 = sycl::malloc_device(dataSize, q); + int *in2 = sycl::malloc_device(dataSize, q); + int *in3 = sycl::malloc_device(dataSize, q); + int *tmp = sycl::malloc_device(dataSize, q); + int *out = sycl::malloc_device(dataSize, q); int dst[dataSize]; - for (size_t i = 0; i < dataSize; ++i) { - in1[i] = i * 2; - in2[i] = i * 3; - in3[i] = i * 4; - tmp[i] = -1; - out[i] = -1; dst[i] = -1; } + q.single_task([=]() { + for (size_t i = 0; i < dataSize; ++i) { + in1[i] = i * 2; + in2[i] = i * 3; + in3[i] = i * 4; + tmp[i] = -1; + out[i] = -1; + } + }).wait(); ext::codeplay::experimental::fusion_wrapper fw{q}; fw.start_fusion(); @@ -59,14 +61,16 @@ int main() { "Queue should not be in fusion mode anymore"); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); - + int host_out[dataSize]; + q.memcpy(host_out, out, dataSize * sizeof(int)); + q.wait(); for (size_t i = 0; i < dataSize; ++i) { - std::cout << out[i] << ", "; + std::cout << host_out[i] << ", "; } std::cout << "\n"; // Check the results for (size_t i = 0; i < dataSize; ++i) { - assert(out[i] == (20 * i * i) && "Computation error"); + assert(host_out[i] == (20 * i * i) && "Computation error"); assert(dst[i] == (5 * i) && "Computation error"); } diff --git a/sycl/test-e2e/Reduction/reduction_internal.cpp b/sycl/test-e2e/Reduction/reduction_internal.cpp index 57947a25176e1..39c64932802da 100644 --- a/sycl/test-e2e/Reduction/reduction_internal.cpp +++ b/sycl/test-e2e/Reduction/reduction_internal.cpp @@ -80,13 +80,13 @@ static void test(RedStorage &Storage, RangeTy Range) { cgh, Range, ext::oneapi::experimental::empty_properties_t{}, RedSycl, [=](auto Item, auto &Red) { Red.combine(T{1}); }); }).wait(); - - auto *Result = malloc_shared(1, q); + sycl::buffer ResultBuf{sycl::range{1}}; q.submit([&](handler &cgh) { - auto RedAcc = GetRedAcc(cgh); - cgh.single_task([=]() { *Result = RedAcc[0]; }); - }).wait(); - + sycl::accessor Result{ResultBuf, cgh}; + auto RedAcc = GetRedAcc(cgh); + cgh.single_task([=]() { Result[0] = RedAcc[0]; }); + }); + sycl::host_accessor Result{ResultBuf}; auto N = get_global_range(Range).size(); int Expected = InitToIdentity ? N : Init + N; #if defined(__PRETTY_FUNCTION__) @@ -94,10 +94,8 @@ static void test(RedStorage &Storage, RangeTy Range) { #elif defined(__FUNCSIG__) std::cout << __FUNCSIG__; #endif - std::cout << ": " << *Result << ", expected " << Expected << std::endl; - assert(*Result == Expected); - - free(Result, q); + std::cout << ": " << Result[0] << ", expected " << Expected << std::endl; + assert(Result[0] == Expected); } template diff --git a/sycl/test-e2e/Reduction/reduction_range_item.cpp b/sycl/test-e2e/Reduction/reduction_range_item.cpp index ed9806e4753b5..93fe915ac0254 100644 --- a/sycl/test-e2e/Reduction/reduction_range_item.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_item.cpp @@ -3,18 +3,20 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - #include using namespace sycl; int main() { queue q; - auto *RedMem = malloc_shared(1, q); - auto *Success = malloc_shared(1, q); - *Success = true; - - *RedMem = 0; + auto *RedMem = malloc_device(1, q); + auto *Success = malloc_device(1, q); + int RedMemHost; + bool SuccessHost; + RedMemHost = 0; + SuccessHost = true; + q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait(); + q.memcpy(Success, &SuccessHost, sizeof(bool)).wait(); q.parallel_for(range<1>{7}, reduction(RedMem, std::plus{}), [=](item<1> Item, auto &Red) { Red += 1; @@ -24,11 +26,13 @@ int main() { *Success = false; }) .wait(); + q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait(); + q.memcpy(&SuccessHost, Success, sizeof(bool)).wait(); + assert(RedMemHost == 7); + assert(SuccessHost); - assert(*RedMem == 7); - assert(*Success); - - *RedMem = 0; + RedMemHost = 0; + q.memcpy(RedMem, &RedMemHost, sizeof(int)).wait(); q.parallel_for(range<2>{1030, 7}, reduction(RedMem, std::plus{}), [=](item<2> Item, auto &Red) { Red += 1; @@ -44,8 +48,10 @@ int main() { }) .wait(); - assert(*RedMem == 1030 * 7); - assert(*Success); + q.memcpy(&RedMemHost, RedMem, sizeof(int)).wait(); + q.memcpy(&SuccessHost, Success, sizeof(bool)).wait(); + assert(RedMemHost == 1030 * 7); + assert(SuccessHost); free(RedMem, q); free(Success, q); diff --git a/sycl/test-e2e/Reduction/reduction_span.cpp b/sycl/test-e2e/Reduction/reduction_span.cpp index 6c06d377eabe2..23ee61e805f9b 100644 --- a/sycl/test-e2e/Reduction/reduction_span.cpp +++ b/sycl/test-e2e/Reduction/reduction_span.cpp @@ -3,7 +3,6 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - // This test performs basic checks of reductions initialized with a sycl::span #include @@ -44,7 +43,7 @@ template (N, Q); + T *Output = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -70,11 +69,13 @@ void test(queue Q, Range Rng, T Identity, T Value) { } bool Passed = true; + T OutputHost[N]; + Q.memcpy(OutputHost, Output, N * sizeof(T)).wait(); for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output[I] == Expected); + Passed &= (OutputHost[I] == Expected); } else { - Passed &= (Output[I] == ExpectedRemainder); + Passed &= (OutputHost[I] == ExpectedRemainder); } } diff --git a/sycl/test-e2e/Reduction/reduction_span_pack.cpp b/sycl/test-e2e/Reduction/reduction_span_pack.cpp index 46862ffe45cf9..4bc4cfb5f9f14 100644 --- a/sycl/test-e2e/Reduction/reduction_span_pack.cpp +++ b/sycl/test-e2e/Reduction/reduction_span_pack.cpp @@ -6,7 +6,6 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows - // This test performs basic checks of reductions initialized with a pack // containing at least one sycl::span @@ -49,9 +48,9 @@ template (1, Q); + int *Sum = malloc_device(1, Q); Q.single_task([=]() { *Sum = 0; }).wait(); - T *Output = malloc_shared(N, Q); + T *Output = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -81,14 +80,18 @@ void test1(queue Q, Range Rng, T Identity, T Value) { } bool Passed = true; + T OutputHost[N]; + Q.memcpy(OutputHost, Output, N * sizeof(T)).wait(); for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output[I] == Expected); + Passed &= (OutputHost[I] == Expected); } else { - Passed &= (Output[I] == ExpectedRemainder); + Passed &= (OutputHost[I] == ExpectedRemainder); } } - Passed &= (*Sum == Size); + int SumHost; + Q.memcpy(&SumHost, Sum, sizeof(int)).wait(); + Passed &= (SumHost == Size); free(Output, Q); free(Sum, Q); @@ -101,9 +104,9 @@ template (N, Q); + int *Output1 = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait(); - T *Output2 = malloc_shared(N, Q); + T *Output2 = malloc_device(N, Q); Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait(); // Perform generalized "histogram" with N bins @@ -121,7 +124,10 @@ void test2(queue Q, Range Rng, T Identity, T Value) { } else /*if (SubmissionMode == submission_mode::queue) */ { Q.parallel_for(Rng, Redu1, Redu2, Kern).wait(); } - + int Output1Host[N]; + T Output2Host[N]; + Q.memcpy(Output1Host, Output1, N * sizeof(int)).wait(); + Q.memcpy(Output2Host, Output2, N * sizeof(T)).wait(); size_t Size = getLinearSize(Rng); bool Passed = true; // Span1 @@ -132,12 +138,11 @@ void test2(queue Q, Range Rng, T Identity, T Value) { ExpectedRemainder = Expected; Expected += 1; } - for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output1[I] == Expected); + Passed &= (Output1Host[I] == Expected); } else { - Passed &= (Output1[I] == ExpectedRemainder); + Passed &= (Output1Host[I] == ExpectedRemainder); } } } @@ -153,9 +158,9 @@ void test2(queue Q, Range Rng, T Identity, T Value) { for (size_t I = 0; I < N; ++I) { if (I < Size % N) { - Passed &= (Output2[I] == Expected); + Passed &= (Output2Host[I] == Expected); } else { - Passed &= (Output2[I] == ExpectedRemainder); + Passed &= (Output2Host[I] == ExpectedRemainder); } } } diff --git a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp index a7d3601210fa8..81e5ddc8b1a27 100644 --- a/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp +++ b/sycl/test-e2e/Regression/exclusive-scan-char-short.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out - +// REQUIRES: aspect-usm_shared_allocations // This test ensures the result computed by exclusive_scan_over_group // for the first work item when given a short or char argument with // the maximum or minimum operator is computed correctly. diff --git a/sycl/test-e2e/Regression/group_local_linear_id.cpp b/sycl/test-e2e/Regression/group_local_linear_id.cpp index d3562c2b25cbf..8fc216f76df31 100644 --- a/sycl/test-e2e/Regression/group_local_linear_id.cpp +++ b/sycl/test-e2e/Regression/group_local_linear_id.cpp @@ -11,14 +11,17 @@ int main() { const sycl::range<3> GlobalRange(2, 8, 16); const sycl::range<3> LocalRange(2, 4, 4); sycl::queue Q; - bool *ReadSame = sycl::malloc_shared(GlobalRange.size(), Q); - Q.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange}, - [=](sycl::nd_item<3> Item) { - ReadSame[Item.get_global_linear_id()] = - Item.get_local_linear_id() == - Item.get_group().get_local_linear_id(); - }) - .wait(); + sycl::buffer ReadSame_buf{GlobalRange.size()}; + Q.submit([&](sycl::handler &h) { + sycl::accessor ReadSame{ReadSame_buf, h}; + h.parallel_for(sycl::nd_range<3>{GlobalRange, LocalRange}, + [=](sycl::nd_item<3> Item) { + ReadSame[Item.get_global_linear_id()] = + Item.get_local_linear_id() == + Item.get_group().get_local_linear_id(); + }); + }); + sycl::host_accessor ReadSame{ReadSame_buf}; int Failures = 0; for (size_t I = 0; I < GlobalRange.size(); ++I) { if (ReadSame[I]) @@ -26,6 +29,5 @@ int main() { ++Failures; std::cout << "Read mismatch at index " << I << std::endl; } - sycl::free(ReadSame, Q); return Failures; } diff --git a/sycl/test-e2e/Regression/half_operators.cpp b/sycl/test-e2e/Regression/half_operators.cpp index b227806a02bc0..14f6924260ec6 100644 --- a/sycl/test-e2e/Regression/half_operators.cpp +++ b/sycl/test-e2e/Regression/half_operators.cpp @@ -7,11 +7,6 @@ using namespace sycl; -template -using shared_allocator = sycl::usm_allocator; - -template using shared_vector = std::vector>; - template bool are_bitwise_equal(T lhs, T rhs) { constexpr size_t size{sizeof(T)}; @@ -33,33 +28,34 @@ template bool test(sycl::queue &queue) { static const T inexact = static_cast(0.1); - shared_vector result_source{NumElems, shared_allocator{queue}}; - shared_vector input{NumElems, shared_allocator{queue}}; + std::vector result_source_vec(NumElems); + std::vector input_vec(NumElems); for (size_t i = 0; i < NumElems; ++i) { - input[i] = inexact * i; + input_vec[i] = inexact * i; } - - queue.submit([&](sycl::handler &cgh) { - auto out_source = result_source.data(); - auto in = input.data(); - - cgh.single_task<>([=]() { - for (size_t i = 0; i < NumElems; ++i) { - auto source = in[i]; - ++source; - out_source[i] = source; - } + { + sycl::buffer result_source_buf{result_source_vec}; + sycl::buffer input_buf{input_vec}; + queue.submit([&](sycl::handler &cgh) { + sycl::accessor out_source{result_source_buf, cgh}; + sycl::accessor in{input_buf, cgh}; + cgh.single_task<>([=]() { + for (size_t i = 0; i < NumElems; ++i) { + auto source = in[i]; + ++source; + out_source[i] = source; + } + }); }); - }); - queue.wait_and_throw(); - + queue.wait_and_throw(); + } // buffers go out of scope here and write back to the vectors for (size_t i = 0; i < NumElems; ++i) { - T expected_value = input[i] + 1; + T expected_value = input_vec[i] + 1; - if (!are_bitwise_equal(expected_value, result_source[i])) { + if (!are_bitwise_equal(expected_value, result_source_vec[i])) { pass = false; - std::cout << "Sample failed retrieved value: " << result_source[i] + std::cout << "Sample failed retrieved value: " << result_source_vec[i] << ", but expected: " << expected_value << ", at index: " << i << std::endl; } diff --git a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp index f2985b5a33b1d..d7adc4e6c153b 100644 --- a/sycl/test-e2e/Regression/pf-wg-atomic64.cpp +++ b/sycl/test-e2e/Regression/pf-wg-atomic64.cpp @@ -10,11 +10,12 @@ using AtomicRefT = int main() { queue q; - auto *p = malloc_shared(1, q); + sycl::buffer p_buf{sycl::range{1}}; try { q.submit([&](sycl::handler &cgh) { + sycl::accessor p{p_buf, cgh}; cgh.parallel_for_work_group(range{1}, range{1}, [=](group<1>) { - AtomicRefT feature(*p); + AtomicRefT feature(p[0]); feature += 42; }); }).wait(); diff --git a/sycl/test-e2e/Regression/range-rounding-this-id.cpp b/sycl/test-e2e/Regression/range-rounding-this-id.cpp index 33fa41c60cc68..a5970adf46e41 100644 --- a/sycl/test-e2e/Regression/range-rounding-this-id.cpp +++ b/sycl/test-e2e/Regression/range-rounding-this-id.cpp @@ -30,13 +30,17 @@ template void test(queue &q) { id this_id; id ref_id; }; - std::vector> vec(range.size(), q); - auto *p = vec.data(); - q.parallel_for(range, [=](auto it) { - p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id(), - it.get_id()}; - }).wait_and_throw(); - + std::vector vec(range.size()); + { + sycl::buffer p_buf{vec}; + q.submit([&](sycl::handler &h) { + sycl::accessor p{p_buf, h}; + h.parallel_for(range, [=](auto it) { + p[it.get_linear_id()] = {sycl::ext::oneapi::experimental::this_id(), + it.get_id()}; + }); + }).wait_and_throw(); + } // p_buf goes out of scope here and writed back to vec for (const auto &[this_item, ref_item] : vec) { if (this_item != ref_item) { std::cout << "fail: " << this_item << " != " << ref_item << "\n"; diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp index dc138d9b79da8..c2d0d3e84ca7c 100644 --- a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp +++ b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp @@ -2,7 +2,6 @@ // RUN: %{build} -o %t.out // // RUN: %{run} %t.out - // Tests that a previously known case for reduction doesn't cause a requirement // for atomic64. // TODO: When aspect requirements are added to testing, this test could be set @@ -19,7 +18,7 @@ using namespace sycl; int main() { queue Q; - long long *Out = malloc_shared(1, Q); + long long *Out = malloc_device(1, Q); // Case 1: nd_range reduction with 64-bit integer and either sycl::plus, // sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy From 3843e6bcadc356c45a73a67139db84776f4c5d68 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 12 Feb 2024 13:12:49 -0800 Subject: [PATCH 3/6] [NFCI][SYCL] Introduce convertFromOpenCLTypeFor(x) helper (#12684) Continuation of refactoring series initiated in https://github.com/intel/llvm/pull/12674. --- sycl/include/sycl/detail/builtins.hpp | 58 +++++++++---------- .../sycl/detail/generic_type_traits.hpp | 12 ++++ sycl/include/sycl/detail/image_ocl_types.hpp | 26 ++++----- .../sycl/ext/oneapi/experimental/builtins.hpp | 6 +- 4 files changed, 52 insertions(+), 50 deletions(-) diff --git a/sycl/include/sycl/detail/builtins.hpp b/sycl/include/sycl/detail/builtins.hpp index d0d76cc1c6925..99fc593156d06 100644 --- a/sycl/include/sycl/detail/builtins.hpp +++ b/sycl/include/sycl/detail/builtins.hpp @@ -9,7 +9,7 @@ #pragma once #include // for __SYCL_ALWAYS_INLINE -#include // for convertDataToType +#include // for to/from OpenCLType converts #include // TODO Decide whether to mark functions with this attribute. @@ -18,27 +18,31 @@ #ifdef __SYCL_DEVICE_ONLY__ #define __FUNC_PREFIX_OCL __spirv_ocl_ #define __FUNC_PREFIX_CORE __spirv_ -#define __SYCL_EXTERN_IT1(Ret, prefix, call, arg1) -#define __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2) -#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg) -#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3) +#define __SYCL_EXTERN_IT1(R, prefix, call, arg1) +#define __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2) +#define __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg) +#define __SYCL_EXTERN_IT3(R, prefix, call, Arg1, Arg2, Arg3) #else #define __FUNC_PREFIX_OCL sycl_host_ #define __FUNC_PREFIX_CORE sycl_host_ -#define __SYCL_EXTERN_IT1(Ret, prefix, call, arg) \ +#define __SYCL_EXTERN_IT1(R, prefix, call, arg) \ using Arg = decltype(arg); \ + using Ret = sycl::detail::ConvertToOpenCLType_t; \ extern Ret __SYCL_PPCAT(prefix, call)(Arg) -#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg) \ +#define __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg) \ using Arg = decltype(arg); \ + using Ret = sycl::detail::ConvertToOpenCLType_t; \ extern Ret __SYCL_PPCAT(prefix, call)(Arg, Arg) -#define __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2) \ +#define __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2) \ using Arg1 = decltype(arg1); \ using Arg2 = decltype(arg2); \ + using Ret = sycl::detail::ConvertToOpenCLType_t; \ extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2) -#define __SYCL_EXTERN_IT3(Ret, prefix, call, arg1, arg2, arg3) \ +#define __SYCL_EXTERN_IT3(R, prefix, call, arg1, arg2, arg3) \ using Arg1 = decltype(arg1); \ using Arg2 = decltype(arg2); \ using Arg3 = decltype(arg3); \ + using Ret = sycl::detail::ConvertToOpenCLType_t; \ extern Ret __SYCL_PPCAT(prefix, call)(Arg1, Arg2, Arg3) #endif @@ -49,10 +53,9 @@ template \ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1) __NOEXC { \ auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ - using Ret = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT1(Ret, prefix, call, arg1); \ - Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1)); \ - return sycl::detail::convertDataToType(std::move(ret)); \ + __SYCL_EXTERN_IT1(R, prefix, call, arg1); \ + return sycl::detail::convertFromOpenCLTypeFor( \ + __SYCL_PPCAT(prefix, call)(std::move(arg1))); \ } #define __SYCL_MAKE_CALL_ARG2(call, prefix) \ @@ -60,10 +63,9 @@ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T1 t1, T2 t2) __NOEXC { \ auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ - using Ret = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT2(Ret, prefix, call, arg1, arg2); \ - Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ - return sycl::detail::convertDataToType(std::move(ret)); \ + __SYCL_EXTERN_IT2(R, prefix, call, arg1, arg2); \ + return sycl::detail::convertFromOpenCLTypeFor( \ + __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2))); \ } #define __SYCL_MAKE_CALL_ARG2_SAME(call, prefix) \ @@ -71,10 +73,9 @@ inline __SYCL_ALWAYS_INLINE R __invoke_##call(T t1, T t2) __NOEXC { \ auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ - using Ret = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, arg1); \ - Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ - return sycl::detail::convertDataToType(std::move(ret)); \ + __SYCL_EXTERN_IT2_SAME(R, prefix, call, arg1); \ + return sycl::detail::convertFromOpenCLTypeFor( \ + __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2))); \ } #define __SYCL_MAKE_CALL_ARG2_SAME_RESULT(call, prefix) \ @@ -82,10 +83,9 @@ inline __SYCL_ALWAYS_INLINE T __invoke_##call(T v1, T v2) __NOEXC { \ auto arg1 = sycl::detail::convertToOpenCLType(std::move(v1)); \ auto arg2 = sycl::detail::convertToOpenCLType(std::move(v2)); \ - using Type = decltype(arg1); \ - __SYCL_EXTERN_IT2_SAME(Type, prefix, call, arg1); \ - Type ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2)); \ - return sycl::detail::convertDataToType(std::move(ret)); \ + __SYCL_EXTERN_IT2_SAME(T, prefix, call, arg1); \ + return sycl::detail::convertFromOpenCLTypeFor( \ + __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2))); \ } #define __SYCL_MAKE_CALL_ARG3(call, prefix) \ @@ -94,11 +94,9 @@ auto arg1 = sycl::detail::convertToOpenCLType(std::move(t1)); \ auto arg2 = sycl::detail::convertToOpenCLType(std::move(t2)); \ auto arg3 = sycl::detail::convertToOpenCLType(std::move(t3)); \ - using Ret = sycl::detail::ConvertToOpenCLType_t; \ - __SYCL_EXTERN_IT3(Ret, prefix, call, arg1, arg2, arg3); \ - Ret ret = __SYCL_PPCAT(prefix, call)(std::move(arg1), std::move(arg2), \ - std::move(arg3)); \ - return sycl::detail::convertDataToType(std::move(ret)); \ + __SYCL_EXTERN_IT3(R, prefix, call, arg1, arg2, arg3); \ + return sycl::detail::convertFromOpenCLTypeFor(__SYCL_PPCAT( \ + prefix, call)(std::move(arg1), std::move(arg2), std::move(arg3))); \ } #ifndef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index ea36fcf7cd669..7cf893778394c 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -704,6 +704,18 @@ template auto convertToOpenCLType(T &&x) { return convertDataToType(std::forward(x)); } +template auto convertFromOpenCLTypeFor(From &&x) { + if constexpr (std::is_same_v && + std::is_same_v, bool>) { + // FIXME: Something seems to be wrong elsewhere... + return x; + } else { + static_assert(std::is_same_v, + ConvertToOpenCLType_t>); + return convertDataToType(std::forward(x)); + } +} + // Used for all, any and select relational built-in functions template inline constexpr T msbMask(T) { using UT = make_unsigned_t; diff --git a/sycl/include/sycl/detail/image_ocl_types.hpp b/sycl/include/sycl/detail/image_ocl_types.hpp index 57f17a33bd13f..95de2eabd1042 100644 --- a/sycl/include/sycl/detail/image_ocl_types.hpp +++ b/sycl/include/sycl/detail/image_ocl_types.hpp @@ -47,9 +47,8 @@ template struct sampled_opencl_image_type; #define __SYCL_INVOKE_SPIRV_CALL_ARG1(call) \ template inline R __invoke_##call(T1 ParT1) { \ using Ret = sycl::detail::ConvertToOpenCLType_t; \ - T1 Arg1 = ParT1; \ - Ret RetVar = __spirv_##call(Arg1); \ - return sycl::detail::convertDataToType(RetVar); \ + return sycl::detail::convertFromOpenCLTypeFor( \ + __spirv_##call(ParT1)); \ } // The macro defines the function __invoke_ImageXXXX, @@ -74,12 +73,10 @@ static RetType __invoke__ImageRead(ImageT Img, CoordT Coords) { // Convert from sycl types to builtin types to get correct function mangling. using TempRetT = sycl::detail::ConvertToOpenCLType_t; - auto TmpCoords = sycl::detail::convertToOpenCLType(Coords); - TempRetT Ret = - __spirv_ImageRead(Img, TmpCoords); - return sycl::detail::convertDataToType(Ret); + return sycl::detail::convertFromOpenCLTypeFor( + __spirv_ImageRead(Img, TmpCoords)); } template @@ -98,10 +95,9 @@ static RetType __invoke__ImageReadLod(SmpImageT SmpImg, CoordT Coords, // Sampled Image must be an object whose type is OpTypeSampledImage // Image Operands encodes what operands follow. Either Lod // or Grad image operands must be present - TempRetT Ret = + return sycl::detail::convertFromOpenCLTypeFor( __spirv_ImageSampleExplicitLod( - SmpImg, TmpCoords, ImageOperands::Lod, Level); - return sycl::detail::convertDataToType(Ret); + SmpImg, TmpCoords, ImageOperands::Lod, Level)); } template @@ -122,10 +118,9 @@ static RetType __invoke__ImageReadGrad(SmpImageT SmpImg, CoordT Coords, // Sampled Image must be an object whose type is OpTypeSampledImage // Image Operands encodes what operands follow. Either Lod // or Grad image operands must be present - TempRetT Ret = + return sycl::detail::convertFromOpenCLTypeFor( __spirv_ImageSampleExplicitLod( - SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY); - return sycl::detail::convertDataToType(Ret); + SmpImg, TmpCoords, ImageOperands::Grad, TmpGraddX, TmpGraddY)); } template @@ -150,11 +145,10 @@ static RetType __invoke__ImageReadSampler(ImageT Img, CoordT Coords, enum ImageOperands { Lod = 0x2 }; // Lod value is zero as mipmap is not supported. - TempRetT Ret = + return sycl::detail::convertFromOpenCLTypeFor( __spirv_ImageSampleExplicitLod( __spirv_SampledImage(Img, Smpl), TmpCoords, - ImageOperands::Lod, 0.0f); - return sycl::detail::convertDataToType(Ret); + ImageOperands::Lod, 0.0f)); } namespace sycl { diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index 3054e661b4a32..26ff67e54a2a8 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -95,8 +95,7 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t< sycl::detail::is_svgenfloatf_v || sycl::detail::is_svgenfloath_v, T> tanh(T x) __NOEXC { #if defined(__NVPTX__) - using _ocl_T = sycl::detail::ConvertToOpenCLType_t; - return sycl::detail::convertDataToType<_ocl_T, T>( + return sycl::detail::convertFromOpenCLTypeFor( __clc_native_tanh(sycl::detail::convertToOpenCLType(x))); #else return __sycl_std::__invoke_tanh(x); @@ -145,8 +144,7 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> exp2(T x) __NOEXC { #if defined(__NVPTX__) - using _ocl_T = sycl::detail::ConvertToOpenCLType_t; - return sycl::detail::convertDataToType<_ocl_T, T>( + return sycl::detail::convertFromOpenCLTypeFor( __clc_native_exp2(sycl::detail::convertToOpenCLType(x))); #else return __sycl_std::__invoke_exp2(x); From b06cfb57dc88ff6a9015680410478a8dcb0776d0 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 12 Feb 2024 13:17:41 -0800 Subject: [PATCH 4/6] [SYCL] Fix remove_decoration for cv-qual decorated ponters/refs (#12691) Previous implementation was incorrect because multiple specialization were matching resulting in an ambiguity. --- sycl/include/sycl/access/access.hpp | 65 ++++++++++++++------------- sycl/test/type_traits/type_traits.cpp | 18 ++++++++ 2 files changed, 53 insertions(+), 30 deletions(-) diff --git a/sycl/include/sycl/access/access.hpp b/sycl/include/sycl/access/access.hpp index ca8bda18eecb7..3e6820a521643 100644 --- a/sycl/include/sycl/access/access.hpp +++ b/sycl/include/sycl/access/access.hpp @@ -256,10 +256,44 @@ struct deduce_AS }; #endif +template struct remove_decoration_impl { + using type = T; +}; + +#ifdef __SYCL_DEVICE_ONLY__ +template struct remove_decoration_impl<__OPENCL_GLOBAL_AS__ T> { + using type = T; +}; + +#ifdef __ENABLE_USM_ADDR_SPACE__ +template +struct remove_decoration_impl<__OPENCL_GLOBAL_DEVICE_AS__ T> { + using type = T; +}; + +template +struct remove_decoration_impl<__OPENCL_GLOBAL_HOST_AS__ T> { + using type = T; +}; + +#endif // __ENABLE_USM_ADDR_SPACE__ + +template struct remove_decoration_impl<__OPENCL_PRIVATE_AS__ T> { + using type = T; +}; + +template struct remove_decoration_impl<__OPENCL_LOCAL_AS__ T> { + using type = T; +}; + +template struct remove_decoration_impl<__OPENCL_CONSTANT_AS__ T> { + using type = T; +}; +#endif // __SYCL_DEVICE_ONLY__ } // namespace detail template struct remove_decoration { - using type = T; + using type = typename detail::remove_decoration_impl::type; }; // Propagate through const qualifier. @@ -287,35 +321,6 @@ template struct remove_decoration { using type = const typename remove_decoration::type &; }; -#ifdef __SYCL_DEVICE_ONLY__ -template struct remove_decoration<__OPENCL_GLOBAL_AS__ T> { - using type = T; -}; - -#ifdef __ENABLE_USM_ADDR_SPACE__ -template struct remove_decoration<__OPENCL_GLOBAL_DEVICE_AS__ T> { - using type = T; -}; - -template struct remove_decoration<__OPENCL_GLOBAL_HOST_AS__ T> { - using type = T; -}; - -#endif // __ENABLE_USM_ADDR_SPACE__ - -template struct remove_decoration<__OPENCL_PRIVATE_AS__ T> { - using type = T; -}; - -template struct remove_decoration<__OPENCL_LOCAL_AS__ T> { - using type = T; -}; - -template struct remove_decoration<__OPENCL_CONSTANT_AS__ T> { - using type = T; -}; -#endif // __SYCL_DEVICE_ONLY__ - template using remove_decoration_t = typename remove_decoration::type; diff --git a/sycl/test/type_traits/type_traits.cpp b/sycl/test/type_traits/type_traits.cpp index 52de6a18570ef..3c53822517d62 100644 --- a/sycl/test/type_traits/type_traits.cpp +++ b/sycl/test/type_traits/type_traits.cpp @@ -227,5 +227,23 @@ int main() { test_is_same_vector_size, s::int2>(); test_is_same_vector_size, float>(); +#ifdef __SYCL_DEVICE_ONLY__ + static_assert( + std::is_same_v< + s::remove_decoration_t, + const int>); + static_assert( + std::is_same_v, + const volatile int>); + static_assert( + std::is_same_v< + s::remove_decoration_t, + const int *>); + static_assert(std::is_same_v, + const int *const>); +#endif + return 0; } From 0c48d9c27de1cc2d9c080dcd9b5370d75de1b8c1 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 12 Feb 2024 16:10:22 -0800 Subject: [PATCH 5/6] [NFCI][SYCL] Use convertToOpenCLType in more places (#12692) Follow-up for https://github.com/intel/llvm/pull/12674, updating places where `ConvertToOpenCLType_t` was used with a plain cast instead of `convertDataToType`. Not touching `multi_ptr` related uses just yet. --- sycl/include/sycl/detail/spirv.hpp | 14 ++++++-------- .../ext/oneapi/experimental/non_uniform_groups.hpp | 4 +--- sycl/include/sycl/ext/oneapi/sub_group_mask.hpp | 5 ++--- sycl/include/sycl/group.hpp | 2 +- 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 5a1215dbe2f8a..54bb7d229c372 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -803,8 +803,7 @@ EnableIfGenericShuffle SubgroupShuffleUp(T x, uint32_t delta); template EnableIfNativeShuffle SubgroupShuffle(T x, id<1> local_id) { #ifndef __NVPTX__ - using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleINTEL(OCLT(x), + return __spirv_SubgroupShuffleINTEL(convertToOpenCLType(x), static_cast(local_id.get(0))); #else return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f); @@ -814,9 +813,8 @@ EnableIfNativeShuffle SubgroupShuffle(T x, id<1> local_id) { template EnableIfNativeShuffle SubgroupShuffleXor(T x, id<1> local_id) { #ifndef __NVPTX__ - using OCLT = detail::ConvertToOpenCLType_t; return __spirv_SubgroupShuffleXorINTEL( - OCLT(x), static_cast(local_id.get(0))); + convertToOpenCLType(x), static_cast(local_id.get(0))); #else return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f); #endif @@ -825,8 +823,8 @@ EnableIfNativeShuffle SubgroupShuffleXor(T x, id<1> local_id) { template EnableIfNativeShuffle SubgroupShuffleDown(T x, uint32_t delta) { #ifndef __NVPTX__ - using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleDownINTEL(OCLT(x), OCLT(x), delta); + return __spirv_SubgroupShuffleDownINTEL(convertToOpenCLType(x), + convertToOpenCLType(x), delta); #else return __nvvm_shfl_sync_down_i32(membermask(), x, delta, 0x1f); #endif @@ -835,8 +833,8 @@ EnableIfNativeShuffle SubgroupShuffleDown(T x, uint32_t delta) { template EnableIfNativeShuffle SubgroupShuffleUp(T x, uint32_t delta) { #ifndef __NVPTX__ - using OCLT = detail::ConvertToOpenCLType_t; - return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x), delta); + return __spirv_SubgroupShuffleUpINTEL(convertToOpenCLType(x), + convertToOpenCLType(x), delta); #else return __nvvm_shfl_sync_up_i32(membermask(), x, delta, 0); #endif diff --git a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp index a2f2bfb321842..af68ce0e10e0f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/non_uniform_groups.hpp @@ -34,11 +34,9 @@ inline sycl::vec ExtractMask(ext::oneapi::sub_group_mask Mask) { // TODO: This may need to be generalized beyond uint32_t for big masks inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) { sycl::vec MemberMask = ExtractMask(Mask); - auto OCLMask = - sycl::detail::ConvertToOpenCLType_t>(MemberMask); return __spirv_GroupNonUniformBallotBitCount( __spv::Scope::Subgroup, (int)__spv::GroupOperation::ExclusiveScan, - OCLMask); + sycl::detail::convertToOpenCLType(MemberMask)); } #endif diff --git a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp index bf28a1792ce86..a72c517fe27b9 100644 --- a/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp +++ b/sycl/include/sycl/ext/oneapi/sub_group_mask.hpp @@ -138,10 +138,9 @@ struct sub_group_mask { for (int i = 0; i < 4; ++i) { MemberMask[i] = TmpMArray[i]; } - auto OCLMask = - sycl::detail::ConvertToOpenCLType_t>(MemberMask); return __spirv_GroupNonUniformBallotBitCount( - __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce, OCLMask); + __spv::Scope::Subgroup, (int)__spv::GroupOperation::Reduce, + sycl::detail::convertToOpenCLType(MemberMask)); #else unsigned int count = 0; auto word = (Bits & valuable_bits(bits_num)); diff --git a/sycl/include/sycl/group.hpp b/sycl/include/sycl/group.hpp index 39b1f0a0293c8..ceb0c58dcf99c 100644 --- a/sycl/include/sycl/group.hpp +++ b/sycl/include/sycl/group.hpp @@ -14,7 +14,7 @@ #include // for NDLoop, __SYCL_ASSERT #include // for __SYCL_TYPE #include // for __SYCL2020_DEPRECATED -#include // for ConvertToOpenCLType_t +#include // for convertToOpenCLType #include // for Builder, getSPIRVMemo... #include // for id, range #include // for is_bool, change_base_... From c872cadd132589f7c15d4f38a28193153ee85697 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 13 Feb 2024 09:24:05 +0100 Subject: [PATCH 6/6] [SYCL][Fusion][NFC] Comment fusion abortion on inter-fusion dependency (#12678) Add comment stating event dependencies on any commands part of a different active fusion leads to the abortion of that fusion. --------- Signed-off-by: Victor Perez --- sycl/source/detail/scheduler/graph_builder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5a26623ce4003..e9ed3f2bb6e4b 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -966,8 +966,8 @@ Scheduler::GraphBuildResult Scheduler::GraphBuilder::addCG( ++Ev; continue; } - // Handle event dependencies on any commands part of another active - // fusion. + // Event dependencies on commands part of another active fusion are + // handled by cancelling fusion in that other queue. if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) { printFusionWarning( "Aborting fusion because of event dependency from a "