intel · againull · Jun 21, 2024 · Apr 18, 2024 · May 28, 2024 · May 31, 2024
@@ -17,6 +17,8 @@
 #include <sycl/half_type.hpp>                 // for BIsRepresentationT
 #include <sycl/multi_ptr.hpp>                 // for multi_ptr, address_spa...
 
+#include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16 storage type.
+
 #include <cstddef>     // for byte
 #include <cstdint>     // for uint8_t
 #include <limits>      // for numeric_limits
@@ -386,7 +388,13 @@ template <typename T> auto convertToOpenCLType(T &&x) {
     static_assert(sizeof(OpenCLType) == sizeof(T));
     return static_cast<OpenCLType>(x);
   } else if constexpr (is_bfloat16_v<no_ref>) {
+    // On host, don't interpret BF16 as uint16.
+#ifdef __SYCL_DEVICE_ONLY__
+    using OpenCLType = sycl::ext::oneapi::detail::Bfloat16StorageT;
+    return sycl::bit_cast<OpenCLType>(x);
+#else
     return std::forward<T>(x);
+#endif
   } else if constexpr (std::is_floating_point_v<no_ref>) {
     static_assert(std::is_same_v<no_ref, float> ||
                       std::is_same_v<no_ref, double>,

@@ -57,6 +57,8 @@
 #include <sycl/detail/generic_type_traits.hpp> // for is_sigeninteger, is_s...
 #include <sycl/exception.hpp>                  // for errc
 
+#include <sycl/ext/oneapi/bfloat16.hpp> // bfloat16
+
 #ifndef __SYCL_DEVICE_ONLY__
 #include <cfenv> // for fesetround, fegetround
 #endif
@@ -123,6 +125,15 @@ using is_float_to_float =
     std::bool_constant<detail::is_floating_point<T>::value &&
                        detail::is_floating_point<R>::value>;
 
+// Shorthand for the bfloat16 extension type used by the conversion traits.
+using bfloat16 = sycl::ext::oneapi::bfloat16;
+
+/// Holds true when the source type T is bfloat16 and the destination type R
+/// is float.
+template <typename T, typename R>
+using is_bf16_to_float = std::bool_constant<std::is_same<T, bfloat16>::value &&
+                                            std::is_same<R, float>::value>;
+
+/// Holds true when the source type T is float and the destination type R is
+/// bfloat16.
+template <typename T, typename R>
+using is_float_to_bf16 = std::bool_constant<std::is_same<T, float>::value &&
+                                            std::is_same<R, bfloat16>::value>;
+
 #ifndef __SYCL_DEVICE_ONLY__
 template <typename From, typename To, int VecSize,
           typename Enable = std::enable_if_t<VecSize == 1>>
@@ -196,8 +207,27 @@ template <typename From, typename To, int VecSize,
 To ConvertFToU(From Value) {
   return ConvertFToS<From, To, VecSize, Enable, roundingMode>(Value);
 }
-#else
 
+/// Host-side scalar conversion from bfloat16 to a native floating-point type.
+/// Only VecSize == 1 is accepted, and the source must be the bfloat16 class
+/// itself rather than an integer storage type.
+template <typename NativeBFT, typename NativeFloatT, int VecSize>
+inline NativeFloatT ConvertBF16ToF(NativeBFT val) {
+  // On host, bfloat16 values are never lowered to uint16 before converting.
+  static_assert(std::is_same_v<NativeBFT, sycl::ext::oneapi::bfloat16>);
+  static_assert(VecSize == 1);
+
+  return static_cast<NativeFloatT>(val);
+}
+
+/// Host-side scalar conversion that builds a bfloat16 from a native
+/// floating-point value. Only VecSize == 1 is accepted, and the destination
+/// must be the bfloat16 class itself rather than an integer storage type.
+template <typename NativeFloatT, typename NativeBFT, int VecSize>
+inline NativeBFT ConvertFToBF16(NativeFloatT val) {
+  // On host, the result is produced as bfloat16 directly, never as uint16.
+  static_assert(std::is_same_v<NativeBFT, sycl::ext::oneapi::bfloat16>);
+  static_assert(VecSize == 1);
+
+  return static_cast<NativeBFT>(val);
+}
+
+#else
 // Bunch of helpers to "specialize" each template for its own destination type
 // and vector size.
 
@@ -498,6 +528,51 @@ __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE(double)
 #undef __SYCL_FLOAT_FLOAT_CONVERT
 #undef __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE
 
+/// Device-side conversion from bfloat16 data to float.
+///
+/// For VecSize == 1, NativeBFT must be Bfloat16StorageT: the value is bit-cast
+/// to bfloat16 and then converted to float. For any other VecSize the argument
+/// is viewed as an array of bfloat16 elements, converted element-wise by
+/// BF16VecToFloatVec, and the staging array is bit-cast to NativeFloatT.
+template <typename NativeBFT, typename NativeFloatT, int VecSize>
+inline NativeFloatT ConvertBF16ToF(NativeBFT vec) {
+  if constexpr (VecSize == 1) {
+    // On device, we interpret bfloat16 as a uint16_t scalar or vector.
+    static_assert(
+        std::is_same_v<NativeBFT, sycl::ext::oneapi::detail::Bfloat16StorageT>);
+
+    // Bitcast to BF16 and convert to float.
+    bfloat16 convertedBF = sycl::bit_cast<bfloat16>(vec);
+    return static_cast<float>(convertedBF);
+  } else {
+    bfloat16 *src = sycl::bit_cast<bfloat16 *>(&vec);
+
+    // OpenCL vector of 3 elements is aligned to 4 multiplied by
+    // the size of data type.
+    constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize;
+    // Value-initialize the staging array: the helper is instantiated with
+    // VecSize, so for 3-element vectors the padding lane is presumably never
+    // written (NOTE(review): confirm against BF16VecToFloatVec). Zeroing it
+    // keeps the bit_cast below from reading an indeterminate value.
+    float dst[AdjustedSize] = {};
+    sycl::ext::oneapi::detail::BF16VecToFloatVec<VecSize>(src, dst);
+
+    return sycl::bit_cast<NativeFloatT>(dst);
+  }
+}
+
+/// Device-side conversion from float data to bfloat16.
+///
+/// For VecSize == 1, NativeBFT must be Bfloat16StorageT: a bfloat16 is built
+/// from the input and bit-cast to the storage type. For any other VecSize the
+/// argument is viewed as an array of float elements, converted element-wise by
+/// FloatVecToBF16Vec, and the staging array is bit-cast to NativeBFT.
+template <typename NativeFloatT, typename NativeBFT, int VecSize>
+inline NativeBFT ConvertFToBF16(NativeFloatT vec) {
+  if constexpr (VecSize == 1) {
+    // On device, we interpret bfloat16 as a uint16_t scalar or vector.
+    static_assert(
+        std::is_same_v<NativeBFT, sycl::ext::oneapi::detail::Bfloat16StorageT>);
+
+    bfloat16 bf16Val(vec);
+    return sycl::bit_cast<NativeBFT>(bf16Val);
+  } else {
+    float *src = sycl::bit_cast<float *>(&vec);
+
+    // OpenCL vector of 3 elements is aligned to 4 multiplied by
+    // the size of data type.
+    constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize;
+    // Value-initialize the staging array: the helper is instantiated with
+    // VecSize, so for 3-element vectors the padding lane is presumably never
+    // written (NOTE(review): confirm against FloatVecToBF16Vec). Initializing
+    // it keeps the bit_cast below from reading an indeterminate value.
+    bfloat16 dst[AdjustedSize] = {};
+
+    sycl::ext::oneapi::detail::FloatVecToBF16Vec<VecSize>(src, dst);
+    return sycl::bit_cast<NativeBFT>(dst);
+  }
+}
+
 #endif // __SYCL_DEVICE_ONLY__
 
 /// Entry point helper for all kinds of converts between scalars and vectors, it
@@ -537,6 +612,10 @@ NativeToT convertImpl(NativeFromT Value) {
   else if constexpr (is_float_to_float<FromT, ToT>::value)
     return FConvert<NativeFromT, NativeToT, VecSize, ElemTy, RoundingMode>(
         Value);
+  else if constexpr (is_bf16_to_float<FromT, ToT>::value)
+    return ConvertBF16ToF<NativeFromT, NativeToT, VecSize>(Value);
+  else if constexpr (is_float_to_bf16<FromT, ToT>::value)
+    return ConvertFToBF16<NativeFromT, NativeToT, VecSize>(Value);
   else if constexpr (is_float_to_sint<FromT, ToT>::value)
     return ConvertFToS<NativeFromT, NativeToT, VecSize, ElemTy, RoundingMode>(
         Value);

@@ -786,9 +786,25 @@ template <typename Type, int NumElements> class vec {
                           detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
       vec<convertT, NumElements>>
   convert() const {
+    using bfloat16 = sycl::ext::oneapi::bfloat16;
     static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
+                      detail::is_floating_point<convertT>::value ||
+                      // Conversion to BF16 available only for float.
+                      (std::is_same_v<convertT, bfloat16> &&
+                       std::is_same_v<DataT, float>),
                   "Unsupported convertT");
+
+    // Currently, for BF16 <--> float conversion, we only support
+    // Round-to-even rounding mode.
+    constexpr bool isFloatToBF16Conv =
+        std::is_same_v<convertT, bfloat16> && std::is_same_v<DataT, float>;
+    constexpr bool isBF16ToFloatConv =
+        std::is_same_v<DataT, bfloat16> && std::is_same_v<convertT, float>;
+    if constexpr (isFloatToBF16Conv || isBF16ToFloatConv) {
+      static_assert(roundingMode == rounding_mode::automatic ||
+                    roundingMode == rounding_mode::rte);
+    }
+
     using T = vec_data_t<DataT>;
     using R = vec_data_t<convertT>;
     using OpenCLT = detail::ConvertToOpenCLType_t<T>;
@@ -828,10 +844,19 @@ template <typename Type, int NumElements> class vec {
     {
       // Otherwise, we fallback to per-element conversion:
       for (size_t I = 0; I < NumElements; ++I) {
-        Result.setValue(
-            I, vec_data<convertT>::get(
-                   detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
-                       vec_data<DataT>::get(getValue(I)))));
+        // For float -> bf16.
+        if constexpr (isFloatToBF16Conv) {
+          Result[I] = bfloat16((*this)[I]);
+        } else
+          // For bf16 -> float.
+          if constexpr (isBF16ToFloatConv) {
+            Result[I] = (float)((*this)[I]);
+          } else {
+            Result.setValue(I, vec_data<convertT>::get(
+                                   detail::convertImpl<T, R, roundingMode, 1,
+                                                       OpenCLT, OpenCLR>(
+                                       vec_data<DataT>::get(getValue(I)))));
+          }
       }
     }
 

@@ -419,9 +419,25 @@ class vec : public detail::vec_arith<DataT, NumElements> {
 
     using T = ConvertBoolAndByteT<DataT>;
     using R = ConvertBoolAndByteT<convertT>;
-    static_assert(std::is_integral_v<R> || detail::is_floating_point<R>::value,
+    using bfloat16 = sycl::ext::oneapi::bfloat16;
+    static_assert(std::is_integral_v<R> ||
+                      detail::is_floating_point<R>::value ||
+                      std::is_same_v<R, bfloat16>,
                   "Unsupported convertT");
 
+    {
+      // Currently, for BF16 <--> float conversion, we only support
+      // Round-to-even rounding mode.
+      constexpr bool isFloatToBF16Conv =
+          std::is_same_v<convertT, bfloat16> && std::is_same_v<DataT, float>;
+      constexpr bool isBF16ToFloatConv =
+          std::is_same_v<DataT, bfloat16> && std::is_same_v<convertT, float>;
+      if constexpr (isFloatToBF16Conv || isBF16ToFloatConv) {
+        static_assert(roundingMode == rounding_mode::automatic ||
+                      roundingMode == rounding_mode::rte);
+      }
+    }
+
     using OpenCLT = detail::ConvertToOpenCLType_t<T>;
     using OpenCLR = detail::ConvertToOpenCLType_t<R>;
     vec<convertT, NumElements> Result;
@@ -479,11 +495,16 @@ class vec : public detail::vec_arith<DataT, NumElements> {
           auto val =
               detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
                   getValue(I));
-          Result[I] = static_cast<convertT>(val);
+#ifdef __SYCL_DEVICE_ONLY__
+          // On device, we interpret BF16 as uint16.
+          if constexpr (std::is_same_v<convertT, bfloat16>)
+            Result[I] = sycl::bit_cast<convertT>(val);
+          else
+#endif
+            Result[I] = static_cast<convertT>(val);
         }
       }
     }
-
     return Result;
   }
 

@@ -16,11 +16,12 @@
 // RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out  %}
 
 #include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/stream.hpp>
 
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
 constexpr unsigned N =
-    10; // init plus arithmetic + - * /   for vec<1> and vec<2>
+    14; // init plus arithmetic + - * / plus convert for vec<1> and vec<2>
 
 int main() {
 
@@ -46,17 +47,26 @@ int main() {
     sycl::vec<T, 1>  simple_multiplication = oneA * oneB;
     sycl::vec<T, 1>  simple_division = oneA / oneB;
 
+    // Test bf16 to float vec conversion on host
+    sycl::vec<float, 1> fConv = init_float.template convert<float>();
+    // Test float to bf16 vec conversion on host
+    sycl::vec<T, 1> brev = fConv.template convert<T>();
+
     std::cout << "iniitialization     : " << oneA[0]             << " float: " << init_float[0] << std::endl;
     std::cout << "addition.        ref: " << addition_ref0       << " vec: " << simple_addition[0] << std::endl;
     std::cout << "subtraction.     ref: " << subtraction_ref0    << " vec: " << simple_subtraction[0] << std::endl;
     std::cout << "multiplication.  ref: " << multiplication_ref0 << " vec: " << simple_multiplication[0] << std::endl;
     std::cout << "division.        ref: " << division_ref0       << " vec: " << simple_division[0] << std::endl;
+    std::cout << "float conv.      ref: " << (float)init_float[0]<< " vec: " << fConv[0] << std::endl;
+    std::cout << "bf16 conv.       ref: " << init_float[0]       << " vec: " << brev[0] << std::endl;
 
     assert(oneA[0] == init_float[0]);
     assert(addition_ref0 == simple_addition[0]);
     assert(subtraction_ref0 == simple_subtraction[0]);
     assert(multiplication_ref0 == simple_multiplication[0]);
     assert(division_ref0 == simple_division[0]);
+    assert((float)init_float[0] == fConv[0]);
+    assert(brev[0] == init_float[0]);
 
     std::cout << " ---  ON DEVICE --- " << std::endl;
     sycl::range<1> r(N);
@@ -72,17 +82,26 @@ int main() {
             sycl::vec<T, 1>  device_multiplication = oneA * oneB;
             sycl::vec<T, 1>  device_division = oneA / oneB;
 
+            // Test bf16 to float vec conversion on device
+            sycl::vec<float, 1> fConv = dev_float.template convert<float>();
+            // Test float to bf16 vec conversion on device
+            sycl::vec<T, 1> brev = fConv.template convert<T>();
+
             out << "iniitialization     : " << oneA[0]             << " float: " << dev_float[0] << sycl::endl;
             out << "addition.        ref: " << addition_ref0       << " vec: " << device_addition[0] << sycl::endl;
             out << "subtraction.     ref: " << subtraction_ref0    << " vec: " << device_subtraction[0] << sycl::endl;
             out << "multiplication.  ref: " << multiplication_ref0 << " vec: " << device_multiplication[0] << sycl::endl;
             out << "division.        ref: " << division_ref0       << " vec: " << device_division[0] << sycl::endl;
+            out << "float conv.      ref: " << (float)dev_float[0] << " vec: " << fConv[0] << sycl::endl;
+            out << "bf16 conv.       ref: " << dev_float[0]        << " vec: " << brev[0] << sycl::endl;
 
             acc[0] = (oneA[0] == dev_float[0]);
             acc[1] = (addition_ref0 == device_addition[0]);
             acc[2] = (subtraction_ref0 == device_subtraction[0]);
             acc[3] = (multiplication_ref0 == device_multiplication[0]);
             acc[4] = (division_ref0 == device_division[0]);
+            acc[5] = ((float)dev_float[0] == fConv[0]);
+            acc[6] = (brev[0] == dev_float[0]);
 
         }); 
     }).wait();
@@ -105,6 +124,11 @@ int main() {
     sycl::vec<T, 2> double_multiplication = twoA * twoB;
     sycl::vec<T, 2> double_division = twoA / twoB;
 
+    // Test bf16 to float vec conversion on host
+    sycl::vec<float, 2> fConv2 = double_float.template convert<float>();
+    // Test float to bf16 vec conversion on host
+    sycl::vec<T, 2> brev2 = fConv2.template convert<T>();
+
     std::cout << "init ref: " << twoA[0]                << "    ref1: " << twoA[1] << std::endl;
     std::cout << "  float0: " << double_float[0]        << "  float1: " << double_float[1] << std::endl;
     std::cout << "+ ref0: " << addition_ref0            << "    ref1: " << addition_ref1 << std::endl;
@@ -115,13 +139,18 @@ int main() {
     std::cout << "mul[0]: " << double_multiplication[0] << "  mul[1]: " << double_multiplication[1] << std::endl;
     std::cout << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << std::endl;
     std::cout << "div[0]: " << double_division[0]       << "  div[1]: " << double_division[1] << std::endl;
-
+    std::cout << "Float convert ref0: " << double_float[0]    << "    ref1: " << double_float[1] << std::endl;
+    std::cout << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << std::endl;
+    std::cout << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << std::endl;
+
     assert(twoA[0] == double_float[0]);                      assert(twoA[1] == double_float[1]);
     assert(addition_ref0 == double_addition[0]);             assert(addition_ref1 == double_addition[1]);
     assert(subtraction_ref0 == double_subtraction[0]);       assert(subtraction_ref1 == double_subtraction[1]);
     assert(multiplication_ref0 == double_multiplication[0]); assert(multiplication_ref1 == double_multiplication[1]);
     assert(division_ref0 == double_division[0]);             assert(division_ref1 == double_division[1]);
-
+    assert(fConv2[0] == (float)double_float[0]);             assert(fConv2[1] == (float)double_float[1]);
+    assert(brev2[0] == double_float[0]);                     assert(brev2[1] == double_float[1]);
+
     std::cout << " ---  ON DEVICE --- " << std::endl;
     q.submit([&](sycl::handler &cgh) {
         sycl::stream out(2024, 400, cgh);
@@ -133,6 +162,11 @@ int main() {
             sycl::vec<T, 2> device_multiplication = twoA * twoB;
             sycl::vec<T, 2> device_division = twoA / twoB;
 
+            // Test bf16 to float vec conversion on device
+            sycl::vec<float, 2> fConv2 = device_float.template convert<float>();
+            // Test float to bf16 vec conversion on device
+            sycl::vec<T, 2> brev2 = fConv2.template convert<T>();
+
             out << "init ref: " << twoA[0]                << "    ref1: " << twoA[1] << sycl::endl;
             out << "  float0: " << device_float[0]        << "  float1: " << device_float[1] << sycl::endl;
             out << "+ ref0: " << addition_ref0            << "    ref1: " << addition_ref1 << sycl::endl;
@@ -143,21 +177,26 @@ int main() {
             out << "mul[0]: " << device_multiplication[0] << "  mul[1]: " << device_multiplication[1] << sycl::endl;
             out << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << sycl::endl;
             out << "div[0]: " << device_division[0]       << "  div[1]: " << device_division[1] << sycl::endl;
-
-            acc[5] = (twoA[0] == device_float[0]) && (twoA[1] == device_float[1]);
-            acc[6] = (addition_ref0 == device_addition[0]) && (addition_ref1 == device_addition[1]);
-            acc[7] = (subtraction_ref0 == device_subtraction[0]) && (subtraction_ref1 == device_subtraction[1]);
-            acc[8] = (multiplication_ref0 == device_multiplication[0]) && (multiplication_ref1 == device_multiplication[1]);
-            acc[9] = (division_ref0 == device_division[0]) && (division_ref1 == device_division[1]);
-
+            out << "Float convert ref0: " << device_float[0]    << "    ref1: " << device_float[1] << sycl::endl;
+            out << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << sycl::endl;
+            out << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << sycl::endl;
+
+            acc[7] = (twoA[0] == device_float[0]) && (twoA[1] == device_float[1]);
+            acc[8] = (addition_ref0 == device_addition[0]) && (addition_ref1 == device_addition[1]);
+            acc[9] = (subtraction_ref0 == device_subtraction[0]) && (subtraction_ref1 == device_subtraction[1]);
+            acc[10] = (multiplication_ref0 == device_multiplication[0]) && (multiplication_ref1 == device_multiplication[1]);
+            acc[11] = (division_ref0 == device_division[0]) && (division_ref1 == device_division[1]);
+            acc[12] = (fConv2[0] == (float)device_float[0]) && (fConv2[1] == (float)device_float[1]);
+            acc[13] = (brev2[0] == device_float[0]) && (brev2[1] == device_float[1]);
         }); 
     }).wait();
+    // clang-format on
 
     sycl::host_accessor h_acc(buf, sycl::read_only);
-    for(unsigned i = 0; i < N; i++){
-        assert(h_acc[i]);
+    for (unsigned i = 0; i < N; i++) {
+      assert(h_acc[i]);
     }
 
-  // clang-format on
-  return 0;
+    std::cout << "Test Passed." << std::endl;
+    return 0;
 }