Skip to content

Commit

Permalink
[SYCL] Use std::array as storage for sycl::vec on device (#14130)
Browse files Browse the repository at this point in the history
Replaces #13270

Changing the storage to std::array instead of Clang's extension fixes
strict ansi-aliasing violation and simplifies device code.
  • Loading branch information
uditagarwal97 committed Jun 12, 2024
1 parent 13a7b3a commit e7defab
Show file tree
Hide file tree
Showing 9 changed files with 488 additions and 995 deletions.
6 changes: 6 additions & 0 deletions sycl/include/sycl/detail/generic_type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ template <typename T> auto convertToOpenCLType(T &&x) {
std::declval<ElemTy>()))>,
no_ref::size()>;
#ifdef __SYCL_DEVICE_ONLY__

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// TODO: for some mysterious reasons on NonUniformGroups E2E tests fail if
// we use the "else" version only. I suspect that's an issues with
// non-uniform groups implementation.
Expand All @@ -350,6 +352,10 @@ template <typename T> auto convertToOpenCLType(T &&x) {
else
return static_cast<typename MatchingVec::vector_t>(
x.template as<MatchingVec>());
#else // __INTEL_PREVIEW_BREAKING_CHANGES
return sycl::bit_cast<typename MatchingVec::vector_t>(x);
#endif // __INTEL_PREVIEW_BREAKING_CHANGES

#else
return x.template as<MatchingVec>();
#endif
Expand Down
170 changes: 78 additions & 92 deletions sycl/include/sycl/detail/vector_arith.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,14 @@ using rel_t = typename std::conditional_t<
friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \
const vec_t & Rhs) { \
vec_t Ret; \
if constexpr (vec_t::IsUsingArrayOnDevice) { \
if constexpr (vec_t::IsBfloat16) { \
for (size_t I = 0; I < NumElements; ++I) { \
detail::VecAccess<vec_t>::setValue( \
Ret, I, \
(detail::VecAccess<vec_t>::getValue(Lhs, I) \
BINOP detail::VecAccess<vec_t>::getValue(Rhs, I))); \
Ret[I] = Lhs[I] BINOP Rhs[I]; \
} \
} else { \
Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \
auto ExtVecLhs = sycl::bit_cast<typename vec_t::vector_t>(Lhs); \
auto ExtVecRhs = sycl::bit_cast<typename vec_t::vector_t>(Rhs); \
Ret = vec<DataT, NumElements>(ExtVecLhs BINOP ExtVecRhs); \
if constexpr (std::is_same_v<DataT, bool> && CONVERT) { \
vec_arith_common<bool, NumElements>::ConvertToDataT(Ret); \
} \
Expand All @@ -72,13 +71,9 @@ using rel_t = typename std::conditional_t<
friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs, \
const vec_t & Rhs) { \
vec_t Ret{}; \
for (size_t I = 0; I < NumElements; ++I) \
detail::VecAccess<vec_t>::setValue( \
Ret, I, \
(DataT)(vec_data<DataT>::get( \
detail::VecAccess<vec_t>::getValue(Lhs, I)) \
BINOP vec_data<DataT>::get( \
detail::VecAccess<vec_t>::getValue(Rhs, I)))); \
for (size_t I = 0; I < NumElements; ++I) { \
Ret[I] = Lhs[I] BINOP Rhs[I]; \
} \
return Ret; \
}
#endif // __SYCL_DEVICE_ONLY__
Expand Down Expand Up @@ -130,83 +125,78 @@ template <typename DataT, int NumElements>
class vec_arith : public vec_arith_common<DataT, NumElements> {
protected:
using vec_t = vec<DataT, NumElements>;
using ocl_t = rel_t<DataT>;
using ocl_t = detail::select_cl_scalar_integral_signed_t<DataT>;
template <typename T> using vec_data = vec_helper<T>;

// operator!.
friend vec<rel_t<DataT>, NumElements> operator!(const vec_t &Rhs) {
if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) {
vec_t Ret{};
friend vec<ocl_t, NumElements> operator!(const vec_t &Rhs) {
#ifdef __SYCL_DEVICE_ONLY__
if constexpr (!vec_t::IsBfloat16) {
auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Rhs);
vec<ocl_t, NumElements> Ret{
(typename vec<ocl_t, NumElements>::vector_t) !extVec};
return Ret;
} else
#endif // __SYCL_DEVICE_ONLY__
{
vec<ocl_t, NumElements> Ret{};
for (size_t I = 0; I < NumElements; ++I) {
detail::VecAccess<vec_t>::setValue(
Ret, I,
!vec_data<DataT>::get(detail::VecAccess<vec_t>::getValue(Rhs, I)));
// static_cast will work here as the output of ! operator is either 0 or
// -1.
Ret[I] = static_cast<ocl_t>(-1 * (!Rhs[I]));
}
return Ret.template as<vec<rel_t<DataT>, NumElements>>();
} else {
return vec_t{(typename vec<DataT, NumElements>::DataType) !Rhs.m_Data}
.template as<vec<rel_t<DataT>, NumElements>>();
return Ret;
}
}

// operator +.
friend vec_t operator+(const vec_t &Lhs) {
if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) {
vec_t Ret{};
for (size_t I = 0; I < NumElements; ++I)
detail::VecAccess<vec_t>::setValue(
Ret, I,
vec_data<DataT>::get(+vec_data<DataT>::get(
detail::VecAccess<vec_t>::getValue(Lhs, I))));
return Ret;
} else {
return vec_t{+Lhs.m_Data};
}
#ifdef __SYCL_DEVICE_ONLY__
auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Lhs);
return vec_t{+extVec};
#else
vec_t Ret{};
for (size_t I = 0; I < NumElements; ++I)
Ret[I] = +Lhs[I];
return Ret;
#endif
}

// operator -.
friend vec_t operator-(const vec_t &Lhs) {
namespace oneapi = sycl::ext::oneapi;
vec_t Ret{};
if constexpr (vec_t::IsBfloat16 && NumElements == 1) {
oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data);
oneapi::bfloat16 w = -v;
Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
} else if constexpr (vec_t::IsBfloat16) {
for (size_t I = 0; I < NumElements; I++) {
oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]);
oneapi::bfloat16 w = -v;
Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
}
} else if constexpr (vec_t::IsUsingArrayOnDevice ||
vec_t::IsUsingArrayOnHost) {
for (size_t I = 0; I < NumElements; ++I)
detail::VecAccess<vec_t>::setValue(
Ret, I,
vec_data<DataT>::get(-vec_data<DataT>::get(
detail::VecAccess<vec_t>::getValue(Lhs, I))));
return Ret;
if constexpr (vec_t::IsBfloat16) {
for (size_t I = 0; I < NumElements; I++)
Ret[I] = -Lhs[I];
} else {
Ret = vec_t{-Lhs.m_Data};
#ifndef __SYCL_DEVICE_ONLY__
for (size_t I = 0; I < NumElements; ++I)
Ret[I] = -Lhs[I];
#else
auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Lhs);
Ret = vec_t{-extVec};
if constexpr (std::is_same_v<DataT, bool>) {
vec_arith_common<bool, NumElements>::ConvertToDataT(Ret);
}
return Ret;
#endif
}
return Ret;
}

// Unary operations on sycl::vec
// FIXME: Don't allow Unary operators on vec<bool> after
// https://github.com/KhronosGroup/SYCL-CTS/issues/896 gets fixed.
#ifdef __SYCL_UOP
#error "Undefine __SYCL_UOP macro"
#endif
#define __SYCL_UOP(UOP, OPASSIGN) \
friend vec_t &operator UOP(vec_t & Rhs) { \
Rhs OPASSIGN vec_data<DataT>::get(1); \
Rhs OPASSIGN DataT{1}; \
return Rhs; \
} \
friend vec_t operator UOP(vec_t &Lhs, int) { \
vec_t Ret(Lhs); \
Lhs OPASSIGN vec_data<DataT>::get(1); \
Lhs OPASSIGN DataT{1}; \
return Ret; \
}

Expand All @@ -228,25 +218,24 @@ class vec_arith : public vec_arith_common<DataT, NumElements> {
friend std::enable_if_t<(COND), vec<ocl_t, NumElements>> operator RELLOGOP( \
const vec_t & Lhs, const vec_t & Rhs) { \
vec<ocl_t, NumElements> Ret{}; \
/* This special case is needed since there are no standard operator|| */ \
/* or operator&& functions for std::array. */ \
if constexpr (vec_t::IsUsingArrayOnDevice && \
(std::string_view(#RELLOGOP) == "||" || \
std::string_view(#RELLOGOP) == "&&")) { \
/* ext_vector_type does not support bfloat16, so for these */ \
/* we do element-by-element operation on the underlying std::array. */ \
if constexpr (vec_t::IsBfloat16) { \
for (size_t I = 0; I < NumElements; ++I) { \
/* We cannot use SetValue here as the operator is not a friend of*/ \
/* Ret on Windows. */ \
Ret[I] = static_cast<ocl_t>( \
-(vec_data<DataT>::get(detail::VecAccess<vec_t>::getValue(Lhs, I)) \
RELLOGOP vec_data<DataT>::get( \
detail::VecAccess<vec_t>::getValue(Rhs, I)))); \
Ret[I] = static_cast<ocl_t>(-(Lhs[I] RELLOGOP Rhs[I])); \
} \
} else { \
auto ExtVecLhs = sycl::bit_cast<typename vec_t::vector_t>(Lhs); \
auto ExtVecRhs = sycl::bit_cast<typename vec_t::vector_t>(Rhs); \
/* Cast required to convert unsigned char ext_vec_type to */ \
/* char ext_vec_type. */ \
Ret = vec<ocl_t, NumElements>( \
(typename vec<ocl_t, NumElements>::vector_t)( \
Lhs.m_Data RELLOGOP Rhs.m_Data)); \
if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \
ExtVecLhs RELLOGOP ExtVecRhs)); \
/* For NumElements == 1, we use scalar instead of ext_vector_type. */ \
if constexpr (NumElements == 1) { \
Ret *= -1; \
} \
} \
return Ret; \
}
Expand All @@ -257,12 +246,7 @@ class vec_arith : public vec_arith_common<DataT, NumElements> {
const vec_t & Lhs, const vec_t & Rhs) { \
vec<ocl_t, NumElements> Ret{}; \
for (size_t I = 0; I < NumElements; ++I) { \
/* We cannot use SetValue here as the operator is not a friend of*/ \
/* Ret on Windows. */ \
Ret[I] = static_cast<ocl_t>( \
-(vec_data<DataT>::get(detail::VecAccess<vec_t>::getValue(Lhs, I)) \
RELLOGOP vec_data<DataT>::get( \
detail::VecAccess<vec_t>::getValue(Rhs, I)))); \
Ret[I] = static_cast<ocl_t>(-(Lhs[I] RELLOGOP Rhs[I])); \
} \
return Ret; \
}
Expand Down Expand Up @@ -376,34 +360,36 @@ template <typename DataT, int NumElements> class vec_arith_common {
protected:
using vec_t = vec<DataT, NumElements>;

static constexpr bool IsBfloat16 =
std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>;

// operator~() available only when: dataT != float && dataT != double
// && dataT != half
template <typename T = DataT>
friend std::enable_if_t<!detail::is_vgenfloat_v<T>, vec_t>
operator~(const vec_t &Rhs) {
if constexpr (vec_t::IsUsingArrayOnDevice || vec_t::IsUsingArrayOnHost) {
vec_t Ret{};
for (size_t I = 0; I < NumElements; ++I) {
detail::VecAccess<vec_t>::setValue(
Ret, I, ~detail::VecAccess<vec_t>::getValue(Rhs, I));
}
return Ret;
} else {
vec_t Ret{(typename vec_t::DataType) ~Rhs.m_Data};
if constexpr (std::is_same_v<DataT, bool>) {
vec_arith_common<bool, NumElements>::ConvertToDataT(Ret);
}
return Ret;
#ifdef __SYCL_DEVICE_ONLY__
auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Rhs);
vec_t Ret{~extVec};
if constexpr (std::is_same_v<DataT, bool>) {
ConvertToDataT(Ret);
}
return Ret;
#else
vec_t Ret{};
for (size_t I = 0; I < NumElements; ++I) {
Ret[I] = ~Rhs[I];
}
return Ret;
#endif
}

#ifdef __SYCL_DEVICE_ONLY__
using vec_bool_t = vec<bool, NumElements>;
// Required only for std::bool.
static void ConvertToDataT(vec_bool_t &Ret) {
for (size_t I = 0; I < NumElements; ++I) {
DataT Tmp = detail::VecAccess<vec_bool_t>::getValue(Ret, I);
detail::VecAccess<vec_bool_t>::setValue(Ret, I, Tmp);
Ret[I] = bit_cast<int8_t>(Ret[I]) != 0;
}
}
#endif
Expand Down
9 changes: 9 additions & 0 deletions sycl/include/sycl/detail/vector_convert.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,15 @@ NativeToT convertImpl(NativeFromT Value) {
}
}

#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
template <typename FromT, typename ToT, sycl::rounding_mode RoundingMode,
int VecSize, typename NativeFromT, typename NativeToT>
auto ConvertImpl(std::byte val) {
return convertImpl<FromT, ToT, RoundingMode, VecSize, NativeFromT, NativeToT>(
(std::int8_t)val);
}
#endif

} // namespace detail
} // namespace _V1
} // namespace sycl
2 changes: 2 additions & 0 deletions sycl/include/sycl/ext/oneapi/bfloat16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ template <int N> void FloatVecToBF16Vec(float src[N], bfloat16 dst[N]) {

// sycl::vec support
namespace bf16 {
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
#ifdef __SYCL_DEVICE_ONLY__
using Vec2StorageT = Bfloat16StorageT __attribute__((ext_vector_type(2)));
using Vec3StorageT = Bfloat16StorageT __attribute__((ext_vector_type(3)));
Expand All @@ -115,6 +116,7 @@ using Vec4StorageT = std::array<Bfloat16StorageT, 4>;
using Vec8StorageT = std::array<Bfloat16StorageT, 8>;
using Vec16StorageT = std::array<Bfloat16StorageT, 16>;
#endif
#endif // __INTEL_PREVIEW_BREAKING_CHANGES
} // namespace bf16
} // namespace detail

Expand Down
6 changes: 6 additions & 0 deletions sycl/include/sycl/half_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,18 +249,22 @@ using StorageT = _Float16;
using BIsRepresentationT = _Float16;
using VecElemT = _Float16;

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
using Vec2StorageT = VecElemT __attribute__((ext_vector_type(2)));
using Vec3StorageT = VecElemT __attribute__((ext_vector_type(3)));
using Vec4StorageT = VecElemT __attribute__((ext_vector_type(4)));
using Vec8StorageT = VecElemT __attribute__((ext_vector_type(8)));
using Vec16StorageT = VecElemT __attribute__((ext_vector_type(16)));
#endif // __INTEL_PREVIEW_BREAKING_CHANGES

#else // SYCL_DEVICE_ONLY
using StorageT = detail::host_half_impl::half;
// No need to extract underlying data type for built-in functions operating on
// host
using BIsRepresentationT = half;
using VecElemT = half;

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
// On the host side we cannot use OpenCL cl_half# types as an underlying type
// for vec because they are actually defined as an integer type under the
// hood. As a result half values will be converted to the integer and passed
Expand All @@ -270,6 +274,8 @@ using Vec3StorageT = std::array<VecElemT, 3>;
using Vec4StorageT = std::array<VecElemT, 4>;
using Vec8StorageT = std::array<VecElemT, 8>;
using Vec16StorageT = std::array<VecElemT, 16>;
#endif // __INTEL_PREVIEW_BREAKING_CHANGES

#endif // SYCL_DEVICE_ONLY

#ifndef __SYCL_DEVICE_ONLY__
Expand Down
Loading

0 comments on commit e7defab

Please sign in to comment.