Merge branch 'sycl-upstream' into maxime/profiling_v2

intel · Feb 14, 2024 · f52745e · f52745e
2 parents 1edcd01 + 24ce45c
commit f52745e
Show file tree

Hide file tree

Showing 190 changed files with 1,476 additions and 629 deletions.
diff --git a/libclc/ptx-nvidiacl/libspirv/images/image.cl b/libclc/ptx-nvidiacl/libspirv/images/image.cl
diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -405,9 +405,7 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
   assert(T && "Unexpected type");
 
   if (T->isAggregateType()) {
-    // TODO: we should use methods which directly return MaybeAlign once such
-    // are added to LLVM for AllocaInst and GlobalVariable
-    auto ShdAlign = MaybeAlign(Shadow->getAlignment());
+    auto ShdAlign = Shadow->getAlign();
     Module &M = *Shadow->getParent();
     auto SizeVal = M.getDataLayout().getTypeStoreSize(T);
     auto Size = ConstantInt::get(getSizeTTy(M), SizeVal);

diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp
@@ -445,8 +445,8 @@ class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl {
 public:
   using TargetFusionInfoImpl::TargetFusionInfoImpl;
 
-  void notifyFunctionsDelete(StringRef MDName,
-                             llvm::ArrayRef<Function *> Funcs) const {
+  void removeDeletedKernelsFromMD(StringRef MDName,
+                                  llvm::ArrayRef<Function *> Funcs) const {
     SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
     SmallVector<MDNode *> ValidKernels;
     auto *OldAnnotations = LLVMMod->getNamedMetadata(MDName);
@@ -469,7 +469,7 @@ class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl {
     }
   }
 
-  void addKernelFunction(StringRef MDName, Function *KernelFunc) const {
+  void addKernelToMD(StringRef MDName, Function *KernelFunc) const {
     auto *Annotations = LLVMMod->getOrInsertNamedMetadata(MDName);
     auto *MDOne = ConstantAsMetadata::get(
         ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
@@ -707,13 +707,11 @@ class NVPTXTargetFusionInfo final : public NVPTXAMDGCNTargetFusionInfoBase {
   using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase;
 
   void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
-    NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("nvvm.annotations",
-                                                           Funcs);
+    removeDeletedKernelsFromMD("nvvm.annotations", Funcs);
   }
 
   void addKernelFunction(Function *KernelFunc) const override {
-    NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("nvvm.annotations",
-                                                       KernelFunc);
+    addKernelToMD("nvvm.annotations", KernelFunc);
   }
 
   void createBarrierCall(IRBuilderBase &Builder,
@@ -818,14 +816,12 @@ class AMDGCNTargetFusionInfo final : public NVPTXAMDGCNTargetFusionInfoBase {
   using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase;
 
   void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
-    NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("amdgcn.annotations",
-                                                           Funcs);
+    removeDeletedKernelsFromMD("amdgcn.annotations", Funcs);
   }
 
   void addKernelFunction(Function *KernelFunc) const override {
     KernelFunc->setCallingConv(CallingConv::AMDGPU_KERNEL);
-    NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("amdgcn.annotations",
-                                                       KernelFunc);
+    addKernelToMD("amdgcn.annotations", KernelFunc);
   }
 
   void createBarrierCall(IRBuilderBase &Builder,

diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md
@@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
 First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:
 
 - `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
-- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
+- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.
 
 Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:
 
 - `global_id`:
   - `global_id(0) = GLID / (global_size(1) * global_size(2))`
   - `global_id(1) = (GLID / global_size(2)) % global_size(1)`
   - `global_id(2) = GLID % global_size(2)`
-- `local_id`:
-  - `local_id(x) = global_id(x) % local_size(x)`
-- `group_id`:
-  - `group_id(x) = global_id(x) / local_size(x)`
 - `num_work_groups`:
   - `num_work_groups(x) = global_size(x) / local_size(x)`
 - `global_size`:
@@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
       sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
 });
 ```
+### Group Algorithms and Functions
+
+Kernel fusion supports group algorithms and functions. As per [remapping
+rules](#work-item-remapping), group ID and local ID are invariant after fusion
+even when different ND-ranges are involved. This way, group functions and
+algorithms conceptually executed for a given group and using a given local ID
+as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.
 
 ### Unsupported SYCL constructs
 

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
@@ -1030,15 +1030,11 @@ of the <<recognized_standard_types>>.
 Sampled images cannot be written to using `write_image`.
 
 For reading and writing of unsampled images, coordinates are specified by `int`, 
-`sycl::vec<int, 2>`, and `sycl::vec<int, 4>` for 1D, 2D, and 3D images, 
+`sycl::vec<int, 2>`, and `sycl::vec<int, 3>` for 1D, 2D, and 3D images, 
 respectively.
 
 Sampled image reads take `float`, `sycl::vec<float, 2>`, and 
-`sycl::vec<float, 4>` coordinate types for 1D, 2D, and 3D images, respectively.
-
-Note that in the case of 3D reads or writes, coordinates for 3D images take a 
-vector of size 4, not 3, as the fourth element in the coordinate vector is 
-ignored.
+`sycl::vec<float, 3>` coordinate types for 1D, 2D, and 3D images, respectively.
 
 Note also that all images must be used in either read-only or write-only fashion 
 within a single kernel invocation; read/write images are not supported.
@@ -1061,7 +1057,7 @@ standard types.
 
 * All POD types (`char`, `short`, `int`, `float`, etc.) excluding `double`
 * `sycl::half`
-* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, `2`, or `4`
+* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, `2`, or `3`
 
 Any other types are classified as user-defined types.
 
@@ -1080,7 +1076,7 @@ struct my_short2 {
 ```
 
 When providing the above types as `DataT` parameters to an image read function, 
-the corresponding `HintT` parameters to use would be `sycl::vec<float, 4>` and 
+the corresponding `HintT` parameters to use would be `sycl::vec<float, 4>` and
 `sycl::vec<short, 2>`, respectively.
 
 == Mipmapped images

diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp
@@ -170,16 +170,22 @@ auto builtin_default_host_impl(FuncTy F, const Ts &...x) {
 template <typename FuncTy, typename... Ts>
 auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) {
   using T = typename first_type<Ts...>::type;
-  if constexpr (is_vec_or_swizzle_v<T>) {
-    using ret_elem_type = decltype(F(x[0]...));
-    // TODO: using r{} to avoid Werror. Not sure if ok.
-    vec<ret_elem_type, T::size()> r{};
-    loop<T::size()>([&](auto idx) { r[idx] = F(x[idx]...); });
-    return r;
+  static_assert(is_vec_or_swizzle_v<T> || is_marray_v<T>);
+
+  constexpr auto Size = T::size();
+  using ret_elem_type = decltype(F(x[0]...));
+  std::conditional_t<is_marray_v<T>, marray<ret_elem_type, Size>,
+                     vec<ret_elem_type, Size>>
+      r{};
+
+  if constexpr (is_marray_v<T>) {
+    for (size_t i = 0; i < Size; ++i)
+      r[i] = F(x[i]...);
   } else {
-    static_assert(is_marray_v<T>);
-    return builtin_marray_impl(F, x...);
+    loop<Size>([&](auto idx) { r[idx] = F(x[idx]...); });
   }
+
+  return r;
 }
 
 template <typename T>

diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -663,6 +663,14 @@ template <> struct ConvertToOpenCLTypeImpl<Boolean<1>> {
   // Or should it be "int"?
   using type = Boolean<1>;
 };
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+// TODO: It seems we only use this to convert a pointer's element type. As such,
+// although it doesn't look very clean, it should be ok having this case handled
+// explicitly until further refactoring of this area.
+template <> struct ConvertToOpenCLTypeImpl<std::byte> {
+  using type = uint8_t;
+};
+#endif
 #endif
 
 template <typename T> struct ConvertToOpenCLTypeImpl<T *> {
@@ -700,8 +708,30 @@ convertDataToType(FROM t) {
 // Now fuse the above into a simpler helper that's easy to use.
 // TODO: That should probably be moved outside of "type_traits".
 template <typename T> auto convertToOpenCLType(T &&x) {
-  using OpenCLType = ConvertToOpenCLType_t<std::remove_reference_t<T>>;
-  return convertDataToType<T, OpenCLType>(std::forward<T>(x));
+  using no_ref = std::remove_reference_t<T>;
+  if constexpr (is_multi_ptr_v<no_ref>) {
+    return convertToOpenCLType(x.get_decorated());
+  } else if constexpr (std::is_pointer_v<no_ref>) {
+    // TODO: Below ignores volatile, but we didn't have a need for it yet.
+    using elem_type = remove_decoration_t<std::remove_pointer_t<no_ref>>;
+    using converted_elem_type_no_cv =
+        ConvertToOpenCLType_t<std::remove_const_t<elem_type>>;
+    using converted_elem_type =
+        std::conditional_t<std::is_const_v<elem_type>,
+                           const converted_elem_type_no_cv,
+                           converted_elem_type_no_cv>;
+#ifdef __SYCL_DEVICE_ONLY__
+    using result_type =
+        typename DecoratedType<converted_elem_type,
+                               deduce_AS<no_ref>::value>::type *;
+#else
+    using result_type = converted_elem_type *;
+#endif
+    return reinterpret_cast<result_type>(x);
+  } else {
+    using OpenCLType = ConvertToOpenCLType_t<no_ref>;
+    return convertDataToType<T, OpenCLType>(std::forward<T>(x));
+  }
 }
 
 template <typename To, typename From> auto convertFromOpenCLTypeFor(From &&x) {