From e7defabdcc3d5b460cfc593822156836b874f092 Mon Sep 17 00:00:00 2001 From: Udit Agarwal Date: Wed, 12 Jun 2024 15:00:07 -0700 Subject: [PATCH] [SYCL] Use `std::array` as storage for `sycl::vec` on device (#14130) Replaces https://github.com/intel/llvm/pull/13270 Changing the storage to std::array instead of Clang's extension fixes strict ansi-aliasing violation and simplifies device code. --- .../sycl/detail/generic_type_traits.hpp | 6 + sycl/include/sycl/detail/vector_arith.hpp | 170 ++-- sycl/include/sycl/detail/vector_convert.hpp | 9 + sycl/include/sycl/ext/oneapi/bfloat16.hpp | 2 + sycl/include/sycl/half_type.hpp | 6 + sycl/include/sycl/vector_preview.hpp | 897 ++++-------------- .../vec_binary_scalar_order_relational.cpp | 3 - sycl/test/abi/layout_vec.cpp | 19 + .../vector/vector_math_ops.cpp | 371 ++++---- 9 files changed, 488 insertions(+), 995 deletions(-) diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index a58493877c3c4..4db558e720d7e 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -342,6 +342,8 @@ template auto convertToOpenCLType(T &&x) { std::declval()))>, no_ref::size()>; #ifdef __SYCL_DEVICE_ONLY__ + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // TODO: for some mysterious reasons on NonUniformGroups E2E tests fail if // we use the "else" version only. I suspect that's an issues with // non-uniform groups implementation. @@ -350,6 +352,10 @@ template auto convertToOpenCLType(T &&x) { else return static_cast( x.template as()); +#else // __INTEL_PREVIEW_BREAKING_CHANGES + return sycl::bit_cast(x); +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #else return x.template as(); #endif diff --git a/sycl/include/sycl/detail/vector_arith.hpp b/sycl/include/sycl/detail/vector_arith.hpp index 5cc54d383016e..4ebd18c28259f 100644 --- a/sycl/include/sycl/detail/vector_arith.hpp +++ b/sycl/include/sycl/detail/vector_arith.hpp @@ -50,15 +50,14 @@ using rel_t = typename std::conditional_t< friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ const vec_t & Rhs) { \ vec_t Ret; \ - if constexpr (vec_t::IsUsingArrayOnDevice) { \ + if constexpr (vec_t::IsBfloat16) { \ for (size_t I = 0; I < NumElements; ++I) { \ - detail::VecAccess::setValue( \ - Ret, I, \ - (detail::VecAccess::getValue(Lhs, I) \ - BINOP detail::VecAccess::getValue(Rhs, I))); \ + Ret[I] = Lhs[I] BINOP Rhs[I]; \ } \ } else { \ - Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ + auto ExtVecLhs = sycl::bit_cast(Lhs); \ + auto ExtVecRhs = sycl::bit_cast(Rhs); \ + Ret = vec(ExtVecLhs BINOP ExtVecRhs); \ if constexpr (std::is_same_v && CONVERT) { \ vec_arith_common::ConvertToDataT(Ret); \ } \ @@ -72,13 +71,9 @@ using rel_t = typename std::conditional_t< friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \ const vec_t & Rhs) { \ vec_t Ret{}; \ - for (size_t I = 0; I < NumElements; ++I) \ - detail::VecAccess::setValue( \ - Ret, I, \ - (DataT)(vec_data::get( \ - detail::VecAccess::getValue(Lhs, I)) \ - BINOP vec_data::get( \ - detail::VecAccess::getValue(Rhs, I)))); \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret[I] = Lhs[I] BINOP Rhs[I]; \ + } \ return Ret; \ } #endif // __SYCL_DEVICE_ONLY__ @@ -130,83 +125,78 @@ template class vec_arith : public vec_arith_common { protected: using vec_t = vec; - using ocl_t = rel_t; + using ocl_t = detail::select_cl_scalar_integral_signed_t; template using vec_data = vec_helper; // operator!. - friend vec, NumElements> operator!(const vec_t &Rhs) { - if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { - vec_t Ret{}; + friend vec operator!(const vec_t &Rhs) { +#ifdef __SYCL_DEVICE_ONLY__ + if constexpr (!vec_t::IsBfloat16) { + auto extVec = sycl::bit_cast(Rhs); + vec Ret{ + (typename vec::vector_t) !extVec}; + return Ret; + } else +#endif // __SYCL_DEVICE_ONLY__ + { + vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { - detail::VecAccess::setValue( - Ret, I, - !vec_data::get(detail::VecAccess::getValue(Rhs, I))); + // static_cast will work here as the output of ! operator is either 0 or + // -1. + Ret[I] = static_cast(-1 * (!Rhs[I])); } - return Ret.template as, NumElements>>(); - } else { - return vec_t{(typename vec::DataType) !Rhs.m_Data} - .template as, NumElements>>(); + return Ret; } } // operator +. friend vec_t operator+(const vec_t &Lhs) { - if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { - vec_t Ret{}; - for (size_t I = 0; I < NumElements; ++I) - detail::VecAccess::setValue( - Ret, I, - vec_data::get(+vec_data::get( - detail::VecAccess::getValue(Lhs, I)))); - return Ret; - } else { - return vec_t{+Lhs.m_Data}; - } +#ifdef __SYCL_DEVICE_ONLY__ + auto extVec = sycl::bit_cast(Lhs); + return vec_t{+extVec}; +#else + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret[I] = +Lhs[I]; + return Ret; +#endif } // operator -. friend vec_t operator-(const vec_t &Lhs) { - namespace oneapi = sycl::ext::oneapi; vec_t Ret{}; - if constexpr (vec_t::IsBfloat16 && NumElements == 1) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); - oneapi::bfloat16 w = -v; - Ret.m_Data = oneapi::detail::bfloat16ToBits(w); - } else if constexpr (vec_t::IsBfloat16) { - for (size_t I = 0; I < NumElements; I++) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); - oneapi::bfloat16 w = -v; - Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); - } - } else if constexpr (vec_t::IsUsingArrayOnDevice || - vec_t::IsUsingArrayOnHost) { - for (size_t I = 0; I < NumElements; ++I) - detail::VecAccess::setValue( - Ret, I, - vec_data::get(-vec_data::get( - detail::VecAccess::getValue(Lhs, I)))); - return Ret; + if constexpr (vec_t::IsBfloat16) { + for (size_t I = 0; I < NumElements; I++) + Ret[I] = -Lhs[I]; } else { - Ret = vec_t{-Lhs.m_Data}; +#ifndef __SYCL_DEVICE_ONLY__ + for (size_t I = 0; I < NumElements; ++I) + Ret[I] = -Lhs[I]; +#else + auto extVec = sycl::bit_cast(Lhs); + Ret = vec_t{-extVec}; if constexpr (std::is_same_v) { vec_arith_common::ConvertToDataT(Ret); } - return Ret; +#endif } + return Ret; } // Unary operations on sycl::vec +// FIXME: Don't allow Unary operators on vec after +// https://github.com/KhronosGroup/SYCL-CTS/issues/896 gets fixed. #ifdef __SYCL_UOP #error "Undefine __SYCL_UOP macro" #endif #define __SYCL_UOP(UOP, OPASSIGN) \ friend vec_t &operator UOP(vec_t & Rhs) { \ - Rhs OPASSIGN vec_data::get(1); \ + Rhs OPASSIGN DataT{1}; \ return Rhs; \ } \ friend vec_t operator UOP(vec_t &Lhs, int) { \ vec_t Ret(Lhs); \ - Lhs OPASSIGN vec_data::get(1); \ + Lhs OPASSIGN DataT{1}; \ return Ret; \ } @@ -228,25 +218,24 @@ class vec_arith : public vec_arith_common { friend std::enable_if_t<(COND), vec> operator RELLOGOP( \ const vec_t & Lhs, const vec_t & Rhs) { \ vec Ret{}; \ - /* This special case is needed since there are no standard operator|| */ \ - /* or operator&& functions for std::array. */ \ - if constexpr (vec_t::IsUsingArrayOnDevice && \ - (std::string_view(#RELLOGOP) == "||" || \ - std::string_view(#RELLOGOP) == "&&")) { \ + /* ext_vector_type does not support bfloat16, so for these */ \ + /* we do element-by-element operation on the underlying std::array. */ \ + if constexpr (vec_t::IsBfloat16) { \ for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast( \ - -(vec_data::get(detail::VecAccess::getValue(Lhs, I)) \ - RELLOGOP vec_data::get( \ - detail::VecAccess::getValue(Rhs, I)))); \ + Ret[I] = static_cast(-(Lhs[I] RELLOGOP Rhs[I])); \ } \ } else { \ + auto ExtVecLhs = sycl::bit_cast(Lhs); \ + auto ExtVecRhs = sycl::bit_cast(Rhs); \ + /* Cast required to convert unsigned char ext_vec_type to */ \ + /* char ext_vec_type. */ \ Ret = vec( \ (typename vec::vector_t)( \ - Lhs.m_Data RELLOGOP Rhs.m_Data)); \ - if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ + ExtVecLhs RELLOGOP ExtVecRhs)); \ + /* For NumElements == 1, we use scalar instead of ext_vector_type. */ \ + if constexpr (NumElements == 1) { \ Ret *= -1; \ + } \ } \ return Ret; \ } @@ -257,12 +246,7 @@ class vec_arith : public vec_arith_common { const vec_t & Lhs, const vec_t & Rhs) { \ vec Ret{}; \ for (size_t I = 0; I < NumElements; ++I) { \ - /* We cannot use SetValue here as the operator is not a friend of*/ \ - /* Ret on Windows. */ \ - Ret[I] = static_cast( \ - -(vec_data::get(detail::VecAccess::getValue(Lhs, I)) \ - RELLOGOP vec_data::get( \ - detail::VecAccess::getValue(Rhs, I)))); \ + Ret[I] = static_cast(-(Lhs[I] RELLOGOP Rhs[I])); \ } \ return Ret; \ } @@ -376,25 +360,28 @@ template class vec_arith_common { protected: using vec_t = vec; + static constexpr bool IsBfloat16 = + std::is_same_v; + // operator~() available only when: dataT != float && dataT != double // && dataT != half template friend std::enable_if_t, vec_t> operator~(const vec_t &Rhs) { - if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) { - vec_t Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - detail::VecAccess::setValue( - Ret, I, ~detail::VecAccess::getValue(Rhs, I)); - } - return Ret; - } else { - vec_t Ret{(typename vec_t::DataType) ~Rhs.m_Data}; - if constexpr (std::is_same_v) { - vec_arith_common::ConvertToDataT(Ret); - } - return Ret; +#ifdef __SYCL_DEVICE_ONLY__ + auto extVec = sycl::bit_cast(Rhs); + vec_t Ret{~extVec}; + if constexpr (std::is_same_v) { + ConvertToDataT(Ret); } + return Ret; +#else + vec_t Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + Ret[I] = ~Rhs[I]; + } + return Ret; +#endif } #ifdef __SYCL_DEVICE_ONLY__ @@ -402,8 +389,7 @@ template class vec_arith_common { // Required only for std::bool. static void ConvertToDataT(vec_bool_t &Ret) { for (size_t I = 0; I < NumElements; ++I) { - DataT Tmp = detail::VecAccess::getValue(Ret, I); - detail::VecAccess::setValue(Ret, I, Tmp); + Ret[I] = bit_cast(Ret[I]) != 0; } } #endif diff --git a/sycl/include/sycl/detail/vector_convert.hpp b/sycl/include/sycl/detail/vector_convert.hpp index c018fce5bcfa3..6552daa560e9a 100644 --- a/sycl/include/sycl/detail/vector_convert.hpp +++ b/sycl/include/sycl/detail/vector_convert.hpp @@ -558,6 +558,15 @@ NativeToT convertImpl(NativeFromT Value) { } } +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) +template +auto ConvertImpl(std::byte val) { + return convertImpl( + (std::int8_t)val); +} +#endif + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp index 1c30b8ad0a2e9..9bb2b659e69b9 100644 --- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp @@ -102,6 +102,7 @@ template void FloatVecToBF16Vec(float src[N], bfloat16 dst[N]) { // sycl::vec support namespace bf16 { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES #ifdef __SYCL_DEVICE_ONLY__ using Vec2StorageT = Bfloat16StorageT __attribute__((ext_vector_type(2))); using Vec3StorageT = Bfloat16StorageT __attribute__((ext_vector_type(3))); @@ -115,6 +116,7 @@ using Vec4StorageT = std::array; using Vec8StorageT = std::array; using Vec16StorageT = std::array; #endif +#endif // __INTEL_PREVIEW_BREAKING_CHANGES } // namespace bf16 } // namespace detail diff --git a/sycl/include/sycl/half_type.hpp b/sycl/include/sycl/half_type.hpp index 951146f2cdfbb..799ff9fb186e9 100644 --- a/sycl/include/sycl/half_type.hpp +++ b/sycl/include/sycl/half_type.hpp @@ -249,11 +249,14 @@ using StorageT = _Float16; using BIsRepresentationT = _Float16; using VecElemT = _Float16; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES using Vec2StorageT = VecElemT __attribute__((ext_vector_type(2))); using Vec3StorageT = VecElemT __attribute__((ext_vector_type(3))); using Vec4StorageT = VecElemT __attribute__((ext_vector_type(4))); using Vec8StorageT = VecElemT __attribute__((ext_vector_type(8))); using Vec16StorageT = VecElemT __attribute__((ext_vector_type(16))); +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #else // SYCL_DEVICE_ONLY using StorageT = detail::host_half_impl::half; // No need to extract underlying data type for built-in functions operating on @@ -261,6 +264,7 @@ using StorageT = detail::host_half_impl::half; using BIsRepresentationT = half; using VecElemT = half; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // On the host side we cannot use OpenCL cl_half# types as an underlying type // for vec because they are actually defined as an integer type under the // hood. As a result half values will be converted to the integer and passed @@ -270,6 +274,8 @@ using Vec3StorageT = std::array; using Vec4StorageT = std::array; using Vec8StorageT = std::array; using Vec16StorageT = std::array; +#endif // __INTEL_PREVIEW_BREAKING_CHANGES + #endif // SYCL_DEVICE_ONLY #ifndef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp index 3d52b297c0ef2..2aac07d56abf8 100644 --- a/sycl/include/sycl/vector_preview.hpp +++ b/sycl/include/sycl/vector_preview.hpp @@ -42,6 +42,7 @@ #include // bfloat16 +#include // for std::min #include // for array #include // for assert #include // for size_t, NULL, byte @@ -83,88 +84,27 @@ struct elem { }; namespace detail { -// select_apply_cl_t selects from T8/T16/T32/T64 basing on -// sizeof(_IN). expected to handle scalar types in _IN. -template -using select_apply_cl_t = std::conditional_t< - sizeof(_IN) == 1, T8, - std::conditional_t>>; - -template struct vec_helper { - using RetType = T; - static constexpr RetType get(T value) { return value; } - static constexpr RetType set(T value) { return value; } -}; -template <> struct vec_helper { - using RetType = select_apply_cl_t; - static constexpr RetType get(bool value) { return value; } - static constexpr RetType set(bool value) { return value; } -}; - -template <> struct vec_helper { - using RetType = sycl::ext::oneapi::bfloat16; - using BFloat16StorageT = sycl::ext::oneapi::detail::Bfloat16StorageT; - static constexpr RetType get(BFloat16StorageT value) { -#if defined(__SYCL_BITCAST_IS_CONSTEXPR) - return sycl::bit_cast(value); -#else - // awkward workaround. sycl::bit_cast isn't constexpr in older GCC - // C++20 will give us both std::bit_cast and constexpr reinterpet for void* - // but neither available yet. - union { - sycl::ext::oneapi::bfloat16 bf16; - sycl::ext::oneapi::detail::Bfloat16StorageT storage; - } result = {}; - result.storage = value; - return result.bf16; -#endif - } - - static constexpr RetType get(RetType value) { return value; } - - static constexpr BFloat16StorageT set(RetType value) { -#if defined(__SYCL_BITCAST_IS_CONSTEXPR) - return sycl::bit_cast(value); -#else - union { - sycl::ext::oneapi::bfloat16 bf16; - sycl::ext::oneapi::detail::Bfloat16StorageT storage; - } result = {}; - result.bf16 = value; - return result.storage; -#endif - } -}; - -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) -template <> struct vec_helper { - using RetType = std::uint8_t; - static constexpr RetType get(std::byte value) { return (RetType)value; } - static constexpr RetType set(std::byte value) { return (RetType)value; } - static constexpr std::byte get(std::uint8_t value) { - return (std::byte)value; - } - static constexpr std::byte set(std::uint8_t value) { - return (std::byte)value; - } -}; -#endif - template class OperationCurrentT, int... Indexes> class SwizzleOp; -template struct VecStorage; - // Special type indicating that SwizzleOp should just read value from vector - // not trying to perform any operations. Should not be called. template class GetOp { public: using DataT = T; - DataT getValue(size_t) const { return (DataT)0; } - DataT operator()(DataT, DataT) { return (DataT)0; } + DataT getValue(size_t) const { + if constexpr (std::is_same_v) + return DataT{0.0f}; + else + return (DataT)0; + } + DataT operator()(DataT, DataT) { + if constexpr (std::is_same_v) + return DataT{0.0f}; + else + return (DataT)0; + } }; // Forward declarations @@ -173,197 +113,32 @@ class RoundedRangeKernel; template class RoundedRangeKernelWithKH; -// Vectors of size 1 are handled separately and therefore 1 is not included in -// the check below. -constexpr bool isValidVectorSize(int N) { - return N == 2 || N == 3 || N == 4 || N == 8 || N == 16; -} -template struct VecStorage { - static_assert( - isValidVectorSize(N) || N == 1, - "Incorrect number of elements for sycl::vec: only 1, 2, 3, 4, 8 " - "or 16 are supported"); - static_assert(!std::is_same_v, "Incorrect data type for sycl::vec"); -}; - -#ifdef __SYCL_DEVICE_ONLY__ -// device always has ext vector support, but for huge vectors -// we switch to std::array, so that we can use a smaller alignment (64) -// this is to support MSVC, which has a max of 64 for direct params. -template struct VecStorageImpl { - static constexpr size_t Num = (N == 3) ? 4 : N; - static constexpr size_t Sz = Num * sizeof(T); - using DataType = - typename std::conditional>::type; - using VectorDataType = T __attribute__((ext_vector_type(N))); -}; -#else // __SYCL_DEVICE_ONLY__ -template struct VecStorageImpl { - using DataType = std::array; -}; -#endif // __SYCL_DEVICE_ONLY__ - -// Single element bool -template <> struct VecStorage { - using DataType = bool; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = bool; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Multiple element bool -template -struct VecStorage> { - using DataType = - typename VecStorageImpl, - N>::DataType; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = - typename VecStorageImpl, - N>::VectorDataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) -// Single element byte. Multiple elements will propagate through a later -// specialization. -template <> struct VecStorage { - using DataType = std::int8_t; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = std::int8_t; -#endif // __SYCL_DEVICE_ONLY__ -}; -#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - -// Single element signed integers +// OpenCL data type to convert to. template -struct VecStorage>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element unsigned integers -template -struct VecStorage>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element floating-point (except half/bfloat16) -template -struct VecStorage< - T, 1, - typename std::enable_if_t && is_sgenfloat_v>> { - using DataType = T; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = DataType; -#endif // __SYCL_DEVICE_ONLY__ -}; -// Multiple elements signed/unsigned integers and floating-point (except -// half/bfloat16) -template -struct VecStorage< - T, N, - typename std::enable_if_t || - (is_sgenfloat_v && !is_half_or_bf16_v))>> { - using DataType = - typename VecStorageImpl::DataType, N>::DataType; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = - typename VecStorageImpl::DataType, - N>::VectorDataType; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Single element half -template <> struct VecStorage { - using DataType = sycl::detail::half_impl::StorageT; -#ifdef __SYCL_DEVICE_ONLY__ - using VectorDataType = sycl::detail::half_impl::StorageT; -#endif // __SYCL_DEVICE_ONLY__ -}; - -// Multiple elements half -#if defined(__SYCL_DEVICE_ONLY__) -#define __SYCL_DEFINE_HALF_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - using VectorDataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - }; -#else // defined(__SYCL_DEVICE_ONLY__) -#define __SYCL_DEFINE_HALF_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::detail::half_impl::Vec##Num##StorageT; \ - }; -#endif // defined(__SYCL_DEVICE_ONLY__) - -__SYCL_DEFINE_HALF_VECSTORAGE(2) -__SYCL_DEFINE_HALF_VECSTORAGE(3) -__SYCL_DEFINE_HALF_VECSTORAGE(4) -__SYCL_DEFINE_HALF_VECSTORAGE(8) -__SYCL_DEFINE_HALF_VECSTORAGE(16) -#undef __SYCL_DEFINE_HALF_VECSTORAGE - -// Single element bfloat16 -template <> struct VecStorage { - using DataType = sycl::ext::oneapi::detail::Bfloat16StorageT; - // using VectorDataType = sycl::ext::oneapi::bfloat16; - using VectorDataType = sycl::ext::oneapi::detail::Bfloat16StorageT; -}; -// Multiple elements bfloat16 -#define __SYCL_DEFINE_BF16_VECSTORAGE(Num) \ - template <> struct VecStorage { \ - using DataType = sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT; \ - using VectorDataType = \ - sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT; \ - }; -__SYCL_DEFINE_BF16_VECSTORAGE(2) -__SYCL_DEFINE_BF16_VECSTORAGE(3) -__SYCL_DEFINE_BF16_VECSTORAGE(4) -__SYCL_DEFINE_BF16_VECSTORAGE(8) -__SYCL_DEFINE_BF16_VECSTORAGE(16) -#undef __SYCL_DEFINE_BF16_VECSTORAGE - -// FIXME: Remove this class after eliminating setValue() and getValue() -// dependencies from math operations on sycl::vec. -// This class is a friend of sycl::vec and exposes getValue/setValue -// that are used by sycl::vec math operations. -template class VecAccess { -public: - template - constexpr static void setValue(VecT &v, int Index, const DataT &Value) { - if (N == 1) - v.setValue(Index, Value, 0); - else - v.setValue(Index, Value, 0.f); - } - - template - static DataT getValue(VecT v, int Index) { - return (N == 1) ? v.getValue(Index, 0) : v.getValue(Index, 0.f); - } -}; +// clang-format off +using element_type_for_vector_t = typename map_type< + T, +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + std::byte, /*->*/ std::uint8_t, +#endif + bool, /*->*/ std::int8_t, + sycl::half, /*->*/ sycl::detail::half_impl::StorageT, + sycl::ext::oneapi::bfloat16, /*->*/ sycl::ext::oneapi::detail::Bfloat16StorageT, + T, /*->*/ T>::type; +// clang-format on } // namespace detail -template using vec_data = detail::vec_helper; - -template -using vec_data_t = typename detail::vec_helper::RetType; - ///////////////////////// class sycl::vec ///////////////////////// // Provides a cross-platform vector class template that works efficiently on // SYCL devices as well as in host C++ code. -template -class vec : public detail::vec_arith { - using DataT = Type; +template +class vec : public detail::vec_arith { + + static_assert(NumElements == 1 || NumElements == 2 || NumElements == 3 || + NumElements == 4 || NumElements == 8 || NumElements == 16, + "Invalid number of elements for sycl::vec: only 1, 2, 3, 4, 8 " + "or 16 are supported"); + static_assert(sizeof(bool) == sizeof(int8_t), "bool size is not 1 byte"); // https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#memory-layout-and-alignment // It is required by the SPEC to align vec with vec. @@ -371,46 +146,20 @@ class vec : public detail::vec_arith { // This represent type of underlying value. There should be only one field // in the class, so vec should be equal to float16 in memory. - using DataType = typename detail::VecStorage::DataType; + using DataType = std::array; public: #ifdef __SYCL_DEVICE_ONLY__ // Type used for passing sycl::vec to SPIRV builtins. // We can not use ext_vector_type(1) as it's not supported by SPIRV // plugins (CTS fails). - using vector_t = - typename detail::VecStorage::VectorDataType; + using vector_t = typename std::conditional_t< + NumElements == 1, detail::element_type_for_vector_t, + detail::element_type_for_vector_t __attribute__(( + ext_vector_type(NumElements)))>; #endif // __SYCL_DEVICE_ONLY__ private: - static constexpr bool IsHostHalf = - std::is_same_v && - std::is_same_v; - - static constexpr bool IsBfloat16 = - std::is_same_v; - - static constexpr size_t Sz = sizeof(DataT) * AdjustedNum; - static constexpr bool IsSizeGreaterThanMaxAlign = - (Sz > detail::MaxVecAlignment); - - // TODO: There is no support for vector half type on host yet. - // Also, when Sz is greater than alignment, we use std::array instead of - // vector extension. This is for MSVC compatibility, which has a max alignment - // of 64 for direct params. If we drop MSVC, we can have alignment the same as - // size and use vector extensions for all sizes. - static constexpr bool IsUsingArrayOnDevice = - (IsHostHalf || IsBfloat16 || IsSizeGreaterThanMaxAlign); - -#if defined(__SYCL_DEVICE_ONLY__) - static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice; - static constexpr bool IsUsingArrayOnHost = false; // not compiling for host. -#else - static constexpr bool NativeVec = false; - static constexpr bool IsUsingArrayOnHost = true; // host always std::array. -#endif - static constexpr int getNumElements() { return NumElements; } // SizeChecker is needed for vec(const argTN &... args) ctor to validate args. @@ -428,7 +177,7 @@ class vec : public detail::vec_arith { template static constexpr std::array VecToArray(const vec &V, std::index_sequence) { - return {static_cast(V.getValue(Is))...}; + return {static_cast(V[Is])...}; } template class T4, int... T5, std::size_t... Is> @@ -465,7 +214,7 @@ class vec : public detail::vec_arith { static constexpr auto FlattenVecArgHelper(const T &A) { // static_cast required to avoid narrowing conversion warning // when T = unsigned long int and DataT_ = int. - return std::array{vec_data::get(static_cast(A))}; + return std::array{static_cast(A)}; } template struct FlattenVecArg { constexpr auto operator()(const T &A) const { @@ -560,214 +309,85 @@ class vec : public detail::vec_arith { using EnableIfSuitableNumElements = typename std::enable_if_t::value>; - // Implementation detail for the next public ctor. - template - constexpr vec(const std::array, NumElements> &Arr, - std::index_sequence) - : m_Data{([&](vec_data_t v) constexpr { - if constexpr (std::is_same_v) - return v.value; - else - return vec_data_t(static_cast(v)); - })(Arr[Is])...} {} - public: // Aliases required by SPEC to make sycl::vec consistent // with that of marray and buffer. using element_type = DataT; using value_type = DataT; - using rel_t = detail::rel_t; + // Element type for relational operator return value. + using rel_t = detail::select_cl_scalar_integral_signed_t; /****************** Constructors **************/ vec() = default; - constexpr vec(const vec &Rhs) = default; constexpr vec(vec &&Rhs) = default; - constexpr vec &operator=(const vec &Rhs) = default; - - // W/o this, things like "vec = vec" doesn't work. - template - typename std::enable_if_t && - std::is_convertible_v, rel_t>, - vec &> - operator=(const vec &Rhs) { - *this = Rhs.template as(); - return *this; - } - -#ifdef __SYCL_DEVICE_ONLY__ - template - using EnableIfNotHostHalf = typename std::enable_if_t; - - template - using EnableIfHostHalf = typename std::enable_if_t; - - template - using EnableIfUsingArrayOnDevice = - typename std::enable_if_t; - - template - using EnableIfNotUsingArrayOnDevice = - typename std::enable_if_t; -#endif // __SYCL_DEVICE_ONLY__ - - template - using EnableIfUsingArray = - typename std::enable_if_t; - - template - using EnableIfNotUsingArray = - typename std::enable_if_t; - -#ifdef __SYCL_DEVICE_ONLY__ - - template - explicit constexpr vec(const EnableIfNotUsingArrayOnDevice &arg) - : m_Data{DataType(vec_data::get(arg))} {} - - template - typename std::enable_if_t< - std::is_fundamental_v> || - detail::is_half_or_bf16_v>, - vec &> - operator=(const EnableIfNotUsingArrayOnDevice &Rhs) { - m_Data = (DataType)vec_data::get(Rhs); - return *this; - } +private: + // Implementation detail for the next public ctor. + template + constexpr vec(const std::array &Arr, + std::index_sequence) + : m_Data{Arr[Is]...} {} - template - explicit constexpr vec(const EnableIfUsingArrayOnDevice &arg) - : vec{detail::RepeatValue( - static_cast>(arg)), +public: + explicit constexpr vec(const DataT &arg) + : vec{detail::RepeatValue(arg), std::make_index_sequence()} {} - template - typename std::enable_if_t< - std::is_fundamental_v> || - detail::is_half_or_bf16_v>, - vec &> - operator=(const EnableIfUsingArrayOnDevice &Rhs) { - for (int i = 0; i < NumElements; ++i) { - setValue(i, Rhs); - } - return *this; - } -#else // __SYCL_DEVICE_ONLY__ - explicit constexpr vec(const DataT &arg) - : vec{detail::RepeatValue( - static_cast>(arg)), + // Constructor from values of base type or vec of base type. Checks that + // base types are match and that the NumElements == sum of lengths of args. + template , + typename = EnableIfSuitableNumElements> + constexpr vec(const argTN &...args) + : vec{VecArgArrayCreator::Create(args...), std::make_index_sequence()} {} /****************** Assignment Operators **************/ + constexpr vec &operator=(const vec &Rhs) = default; // Template required to prevent ambiguous overload with the copy assignment // when NumElements == 1. The template prevents implicit conversion from // vec<_, 1> to DataT. template typename std::enable_if_t< - std::is_fundamental_v> || + std::is_fundamental_v || detail::is_half_or_bf16_v>, vec &> operator=(const DataT &Rhs) { - for (int i = 0; i < NumElements; ++i) { - setValue(i, Rhs); - } + *this = vec{Rhs}; return *this; } -#endif // __SYCL_DEVICE_ONLY__ - -#ifdef __SYCL_DEVICE_ONLY__ - // Optimized naive constructors with NumElements of DataT values. - // We don't expect compilers to optimize vararg recursive functions well. - // Helper type to make specific constructors available only for specific - // number of elements. - template - using EnableIfMultipleElems = typename std::enable_if_t< - std::is_convertible_v && NumElements == IdxNum, DataT>; - template - constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1)} {} - template - constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2)} {} - template - constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const Ty Arg3) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3)} {} - template - constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const DataT Arg3, const DataT Arg4, const DataT Arg5, - const DataT Arg6, const DataT Arg7) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3), - vec_data::get(Arg4), vec_data::get(Arg5), - vec_data::get(Arg6), vec_data::get(Arg7)} {} + // W/o this, things like "vec = vec" doesn't work. template - constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0, - const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, - const DataT Arg3, const DataT Arg4, const DataT Arg5, - const DataT Arg6, const DataT Arg7, const DataT Arg8, - const DataT Arg9, const DataT ArgA, const DataT ArgB, - const DataT ArgC, const DataT ArgD, const DataT ArgE, - const DataT ArgF) - : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), - vec_data::get(Arg2), vec_data::get(Arg3), - vec_data::get(Arg4), vec_data::get(Arg5), - vec_data::get(Arg6), vec_data::get(Arg7), - vec_data::get(Arg8), vec_data::get(Arg9), - vec_data::get(ArgA), vec_data::get(ArgB), - vec_data::get(ArgC), vec_data::get(ArgD), - vec_data::get(ArgE), vec_data::get(ArgF)} {} -#endif // __SYCL_DEVICE_ONLY__ - - // Constructor from values of base type or vec of base type. Checks that - // base types are match and that the NumElements == sum of lengths of args. - template , - typename = EnableIfSuitableNumElements> - constexpr vec(const argTN &...args) - : vec{VecArgArrayCreator, argTN...>::Create(args...), - std::make_index_sequence()} {} + typename std::enable_if_t< + !std::is_same_v && std::is_convertible_v, vec &> + operator=(const vec &Rhs) { + *this = Rhs.template as(); + return *this; + } #ifdef __SYCL_DEVICE_ONLY__ - template && - !std::is_same_v>> - constexpr vec(vector_t openclVector) { - if constexpr (!IsUsingArrayOnDevice) { - m_Data = openclVector; - } else { - m_Data = bit_cast(openclVector); - } + template < + typename vector_t_ = vector_t, + typename = typename std::enable_if_t>> + constexpr vec(vector_t_ openclVector) { + m_Data = sycl::bit_cast(openclVector); } - /* Available only when: compiled for the device. + /* @SYCL2020 + * Available only when: compiled for the device. * Converts this SYCL vec instance to the underlying backend-native vector * type defined by vector_t. */ - operator vector_t() const { - if constexpr (!IsUsingArrayOnDevice) { - return m_Data; - } else { - auto ptr = bit_cast((&m_Data)->data()); - return *ptr; - } - } + operator vector_t() const { return sycl::bit_cast(m_Data); } + #endif // __SYCL_DEVICE_ONLY__ // Available only when: NumElements == 1 template operator typename std::enable_if_t() const { - return vec_data::get(m_Data); + return m_Data[0]; } __SYCL2020_DEPRECATED("get_count() is deprecated, please use size() instead") @@ -778,86 +398,89 @@ class vec : public detail::vec_arith { static constexpr size_t get_size() { return byte_size(); } static constexpr size_t byte_size() noexcept { return sizeof(m_Data); } + // We interpret bool as int8_t, std::byte as uint8_t for conversion to other + // types. + // clang-format off + template + using ConvertBoolAndByteT = typename detail::map_type< + T, +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + std::byte, /*->*/ std::uint8_t, +#endif + bool, /*->*/ std::int8_t, + T, /*->*/ T>::type; + // clang-format on + // convertImpl can't be called with the same From and To types and therefore // we need this version of convert which is mostly no-op. template - std::enable_if_t< - std::is_same_v, vec_data_t> || - std::is_same_v>, - detail::ConvertToOpenCLType_t>>, - vec> - convert() const { - static_assert(std::is_integral_v> || - detail::is_floating_point::value, - "Unsupported convertT"); - if constexpr (!std::is_same_v) { - // Dummy conversion for cases like vec -> vec - vec Result; - for (size_t I = 0; I < NumElements; ++I) - Result.setValue(I, static_cast(getValue(I))); + vec convert() const { - return Result; - } else { - // No conversion necessary - return *this; - } - } - - template - std::enable_if_t< - !std::is_same_v, vec_data_t> && - !std::is_same_v>, - detail::ConvertToOpenCLType_t>>, - vec> - convert() const { - static_assert(std::is_integral_v> || - detail::is_floating_point::value, + using T = ConvertBoolAndByteT; + using R = ConvertBoolAndByteT; + static_assert(std::is_integral_v || detail::is_floating_point::value, "Unsupported convertT"); - using T = vec_data_t; - using R = vec_data_t; + using OpenCLT = detail::ConvertToOpenCLType_t; using OpenCLR = detail::ConvertToOpenCLType_t; vec Result; -#if defined(__SYCL_DEVICE_ONLY__) - using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements))); - using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements))); - // Whole vector conversion can only be done, if: - constexpr bool canUseNativeVectorConvert = + // For conversion between bool -> signed char and byte -> uint8_t. + if constexpr (!std::is_same_v && + (std::is_same_v || std::is_same_v)) { + for (size_t I = 0; I < NumElements; ++I) + Result[I] = static_cast(getValue(I)); + return Result; + } else if constexpr (std::is_same_v) { + return *this; + } else { + +#ifdef __SYCL_DEVICE_ONLY__ + using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements))); + using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements))); + + auto NativeVector = sycl::bit_cast(*this); + using ConvertTVecType = typename vec::vector_t; + + // Whole vector conversion can only be done, if: + constexpr bool canUseNativeVectorConvert = #ifdef __NVPTX__ - // - we are not on CUDA, see intel/llvm#11840 - false && + // TODO: Likely unnecessary as + // https://github.com/intel/llvm/issues/11840 has been closed + // already. + false && #endif - // - both vectors are represented using native vector types; - NativeVec && vec::NativeVec && - // - vec storage has an equivalent OpenCL native vector it is implicitly - // convertible to. There are some corner cases where it is not the - // case with char, long and long long types. - std::is_convertible_v && - std::is_convertible_v && - // - it is not a signed to unsigned (or vice versa) conversion - // see comments within 'convertImpl' for more details; - !detail::is_sint_to_from_uint::value && - // - destination type is not bool. bool is stored as integer under the - // hood and therefore conversion to bool looks like conversion between - // two integer types. Since bit pattern for true and false is not - // defined, there is no guarantee that integer conversion yields - // right results here; - !std::is_same_v; - if constexpr (canUseNativeVectorConvert) { - Result.m_Data = detail::convertImpl(m_Data); - } else -#endif // defined(__SYCL_DEVICE_ONLY__) - { - // Otherwise, we fallback to per-element conversion: - for (size_t I = 0; I < NumElements; ++I) { - Result.setValue( - I, vec_data::get( - detail::convertImpl( - vec_data::get(getValue(I))))); + NumElements > 1 && + // - vec storage has an equivalent OpenCL native vector it is + // implicitly convertible to. There are some corner cases where it + // is not the case with char, long and long long types. + std::is_convertible_v && + std::is_convertible_v && + // - it is not a signed to unsigned (or vice versa) conversion + // see comments within 'convertImpl' for more details; + !detail::is_sint_to_from_uint::value && + // - destination type is not bool. bool is stored as integer under the + // hood and therefore conversion to bool looks like conversion + // between two integer types. Since bit pattern for true and false + // is not defined, there is no guarantee that integer conversion + // yields right results here; + !std::is_same_v; + + if constexpr (canUseNativeVectorConvert) { + Result.m_Data = sycl::bit_cast( + detail::convertImpl(NativeVector)); + } else +#endif // __SYCL_DEVICE_ONLY__ + { + // Otherwise, we fallback to per-element conversion: + for (size_t I = 0; I < NumElements; ++I) { + auto val = + detail::convertImpl( + getValue(I)); + Result[I] = static_cast(val); + } } } @@ -887,58 +510,10 @@ class vec : public detail::vec_arith { return this; } - // ext_vector_type is used as an underlying type for sycl::vec on device. - // The problem is that for clang vector types the return of operator[] is a - // temporary and not a reference to the element in the vector. In practice - // reinterpret_cast(&m_Data)[i]; is working. According to - // http://llvm.org/docs/GetElementPtr.html#can-gep-index-into-vector-elements - // this is not disallowed now. But could probably be disallowed in the future. - // That is why tests are added to check that behavior of the compiler has - // not changed. - // // Implement operator [] in the same way for host and device. - // TODO: change host side implementation when underlying type for host side - // will be changed to std::array. - // NOTE: aliasing the incompatible types of bfloat16 may lead to problems if - // aggressively optimized. Specializing with noinline to avoid as workaround. + const DataT &operator[](int i) const { return m_Data[i]; } - template - typename std::enable_if_t, - const DataT &> - operator[](int i) const { - return reinterpret_cast(&m_Data)[i]; - } - - template - typename std::enable_if_t, - DataT &> - operator[](int i) { - return reinterpret_cast(&m_Data)[i]; - } - -#ifdef _MSC_VER -#define __SYCL_NOINLINE_BF16 __declspec(noinline) -#else -#define __SYCL_NOINLINE_BF16 __attribute__((noinline)) -#endif - - template - __SYCL_NOINLINE_BF16 - typename std::enable_if_t, - const DataT &> - operator[](int i) const { - return reinterpret_cast(&m_Data)[i]; - } - - template - __SYCL_NOINLINE_BF16 - typename std::enable_if_t, - DataT &> - operator[](int i) { - return reinterpret_cast(&m_Data)[i]; - } - -#undef __SYCL_NOINLINE_BF16 + DataT &operator[](int i) { return m_Data[i]; } // Begin hi/lo, even/odd, xyzw, and rgba swizzles. private: @@ -963,8 +538,8 @@ class vec : public detail::vec_arith { template void load(size_t Offset, multi_ptr Ptr) { for (int I = 0; I < NumElements; I++) { - setValue(I, *multi_ptr( - Ptr + Offset * NumElements + I)); + m_Data[I] = *multi_ptr( + Ptr + Offset * NumElements + I); } } template @@ -989,7 +564,7 @@ class vec : public detail::vec_arith { multi_ptr Ptr) const { for (int I = 0; I < NumElements; I++) { *multi_ptr(Ptr + Offset * NumElements + - I) = getValue(I); + I) = m_Data[I]; } } template { } private: - // Generic method that execute "Operation" on underlying values. -#ifdef __SYCL_DEVICE_ONLY__ - template