Skip to content

Commit

Permalink
Merge branch 'sycl-upstream' into maxime/profiling_v2
Browse files Browse the repository at this point in the history
  • Loading branch information
mfrancepillois committed Feb 14, 2024
2 parents 1edcd01 + 24ce45c commit f52745e
Show file tree
Hide file tree
Showing 190 changed files with 1,476 additions and 629 deletions.
194 changes: 98 additions & 96 deletions libclc/ptx-nvidiacl/libspirv/images/image.cl

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,7 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
assert(T && "Unexpected type");

if (T->isAggregateType()) {
// TODO: we should use methods which directly return MaybeAlign once such
// are added to LLVM for AllocaInst and GlobalVariable
auto ShdAlign = MaybeAlign(Shadow->getAlignment());
auto ShdAlign = Shadow->getAlign();
Module &M = *Shadow->getParent();
auto SizeVal = M.getDataLayout().getTypeStoreSize(T);
auto Size = ConstantInt::get(getSizeTTy(M), SizeVal);
Expand Down
18 changes: 7 additions & 11 deletions sycl-fusion/passes/target/TargetFusionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,8 +445,8 @@ class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl {
public:
using TargetFusionInfoImpl::TargetFusionInfoImpl;

void notifyFunctionsDelete(StringRef MDName,
llvm::ArrayRef<Function *> Funcs) const {
void removeDeletedKernelsFromMD(StringRef MDName,
llvm::ArrayRef<Function *> Funcs) const {
SmallPtrSet<Constant *, 8> DeletedFuncs{Funcs.begin(), Funcs.end()};
SmallVector<MDNode *> ValidKernels;
auto *OldAnnotations = LLVMMod->getNamedMetadata(MDName);
Expand All @@ -469,7 +469,7 @@ class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl {
}
}

void addKernelFunction(StringRef MDName, Function *KernelFunc) const {
void addKernelToMD(StringRef MDName, Function *KernelFunc) const {
auto *Annotations = LLVMMod->getOrInsertNamedMetadata(MDName);
auto *MDOne = ConstantAsMetadata::get(
ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1));
Expand Down Expand Up @@ -707,13 +707,11 @@ class NVPTXTargetFusionInfo final : public NVPTXAMDGCNTargetFusionInfoBase {
using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase;

void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("nvvm.annotations",
Funcs);
removeDeletedKernelsFromMD("nvvm.annotations", Funcs);
}

void addKernelFunction(Function *KernelFunc) const override {
NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("nvvm.annotations",
KernelFunc);
addKernelToMD("nvvm.annotations", KernelFunc);
}

void createBarrierCall(IRBuilderBase &Builder,
Expand Down Expand Up @@ -818,14 +816,12 @@ class AMDGCNTargetFusionInfo final : public NVPTXAMDGCNTargetFusionInfoBase {
using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase;

void notifyFunctionsDelete(llvm::ArrayRef<Function *> Funcs) const override {
NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("amdgcn.annotations",
Funcs);
removeDeletedKernelsFromMD("amdgcn.annotations", Funcs);
}

void addKernelFunction(Function *KernelFunc) const override {
KernelFunc->setCallingConv(CallingConv::AMDGPU_KERNEL);
NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("amdgcn.annotations",
KernelFunc);
addKernelToMD("amdgcn.annotations", KernelFunc);
}

void createBarrierCall(IRBuilderBase &Builder,
Expand Down
14 changes: 9 additions & 5 deletions sycl/doc/design/KernelFusionJIT.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,18 +226,15 @@ This remapping consists on an inter-procedural pass replacing each built-in quer
First of all, work-item remapping will always be performed when the list of input nd-ranges is heterogeneous. Additional remapping conditions are present for the following work-item components. For each input kernel:

- `num_work_groups` and `local_size`: Only performed if the input nd-range has an explicit local size, may result in better performance, as this replaces built-in calls with constants;
- `global_id`, `local_id` and `group_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `global_id`: Only needed if the number of dimensions differ w.r.t. that of the fused kernel or any component of the global size in the range [2, `num_dims`] differs.
- `local_id` and `group_id`: Never needed as per [kernel fusion restrictions](#restrictions). These are invariant after fusion.

Once this rules are set, also taking into account remapping constraints, the remapping is performed as follows for each input kernel:

- `global_id`:
- `global_id(0) = GLID / (global_size(1) * global_size(2))`
- `global_id(1) = (GLID / global_size(2)) % global_size(1)`
- `global_id(2) = GLID % global_size(2)`
- `local_id`:
- `local_id(x) = global_id(x) % local_size(x)`
- `group_id`:
- `group_id(x) = global_id(x) / local_size(x)`
- `num_work_groups`:
- `num_work_groups(x) = global_size(x) / local_size(x)`
- `global_size`:
Expand Down Expand Up @@ -348,6 +345,13 @@ q.submit([&](sycl::handler &cgh) {
sycl::detail::strategy::group_reduce_and_last_wg_detection>(...);
});
```
### Group Algorithms and Functions

Kernel fusion supports group algorithms and functions. As per [remapping
rules](#work-item-remapping), group ID and local ID are invariant after fusion
even when different ND-ranges are involved. This way, group functions and
algorithms conceptually executed for a given group and using a given local ID
as, e.g., the `group_broadcast` local ID, will keep semantics after fusion.

### Unsupported SYCL constructs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1030,15 +1030,11 @@ of the <<recognized_standard_types>>.
Sampled images cannot be written to using `write_image`.

For reading and writing of unsampled images, coordinates are specified by `int`,
`sycl::vec<int, 2>`, and `sycl::vec<int, 4>` for 1D, 2D, and 3D images,
`sycl::vec<int, 2>`, and `sycl::vec<int, 3>` for 1D, 2D, and 3D images,
respectively.

Sampled image reads take `float`, `sycl::vec<float, 2>`, and
`sycl::vec<float, 4>` coordinate types for 1D, 2D, and 3D images, respectively.

Note that in the case of 3D reads or writes, coordinates for 3D images take a
vector of size 4, not 3, as the fourth element in the coordinate vector is
ignored.
`sycl::vec<float, 3>` coordinate types for 1D, 2D, and 3D images, respectively.

Note also that all images must be used in either read-only or write-only fashion
within a single kernel invocation; read/write images are not supported.
Expand All @@ -1061,7 +1057,7 @@ standard types.

* All POD types (`char`, `short`, `int`, `float`, etc.) excluding `double`
* `sycl::half`
* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, `2`, or `4`
* Variants of `sycl::vec<T, N>` where `T` is one of the above, and `N` is `1`, `2`, or `3`

Any other types are classified as user-defined types.

Expand All @@ -1080,7 +1076,7 @@ struct my_short2 {
```

When providing the above types as `DataT` parameters to an image read function,
the corresponding `HintT` parameters to use would be `sycl::vec<float, 4>` and
the corresponding `HintT` parameters to use would be `sycl::vec<float, 4>` and
`sycl::vec<short, 2>`, respectively.

== Mipmapped images
Expand Down
22 changes: 14 additions & 8 deletions sycl/include/sycl/builtins_preview.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,22 @@ auto builtin_default_host_impl(FuncTy F, const Ts &...x) {
template <typename FuncTy, typename... Ts>
auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) {
using T = typename first_type<Ts...>::type;
if constexpr (is_vec_or_swizzle_v<T>) {
using ret_elem_type = decltype(F(x[0]...));
// TODO: using r{} to avoid Werror. Not sure if ok.
vec<ret_elem_type, T::size()> r{};
loop<T::size()>([&](auto idx) { r[idx] = F(x[idx]...); });
return r;
static_assert(is_vec_or_swizzle_v<T> || is_marray_v<T>);

constexpr auto Size = T::size();
using ret_elem_type = decltype(F(x[0]...));
std::conditional_t<is_marray_v<T>, marray<ret_elem_type, Size>,
vec<ret_elem_type, Size>>
r{};

if constexpr (is_marray_v<T>) {
for (size_t i = 0; i < Size; ++i)
r[i] = F(x[i]...);
} else {
static_assert(is_marray_v<T>);
return builtin_marray_impl(F, x...);
loop<Size>([&](auto idx) { r[idx] = F(x[idx]...); });
}

return r;
}

template <typename T>
Expand Down
34 changes: 32 additions & 2 deletions sycl/include/sycl/detail/generic_type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,14 @@ template <> struct ConvertToOpenCLTypeImpl<Boolean<1>> {
// Or should it be "int"?
using type = Boolean<1>;
};
#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
// TODO: It seems we only use this to convert a pointer's element type. As such,
// although it doesn't look very clean, it should be ok having this case handled
// explicitly until further refactoring of this area.
template <> struct ConvertToOpenCLTypeImpl<std::byte> {
using type = uint8_t;
};
#endif
#endif

template <typename T> struct ConvertToOpenCLTypeImpl<T *> {
Expand Down Expand Up @@ -700,8 +708,30 @@ convertDataToType(FROM t) {
// Now fuse the above into a simpler helper that's easy to use.
// TODO: That should probably be moved outside of "type_traits".
template <typename T> auto convertToOpenCLType(T &&x) {
using OpenCLType = ConvertToOpenCLType_t<std::remove_reference_t<T>>;
return convertDataToType<T, OpenCLType>(std::forward<T>(x));
using no_ref = std::remove_reference_t<T>;
if constexpr (is_multi_ptr_v<no_ref>) {
return convertToOpenCLType(x.get_decorated());
} else if constexpr (std::is_pointer_v<no_ref>) {
// TODO: Below ignores volatile, but we didn't have a need for it yet.
using elem_type = remove_decoration_t<std::remove_pointer_t<no_ref>>;
using converted_elem_type_no_cv =
ConvertToOpenCLType_t<std::remove_const_t<elem_type>>;
using converted_elem_type =
std::conditional_t<std::is_const_v<elem_type>,
const converted_elem_type_no_cv,
converted_elem_type_no_cv>;
#ifdef __SYCL_DEVICE_ONLY__
using result_type =
typename DecoratedType<converted_elem_type,
deduce_AS<no_ref>::value>::type *;
#else
using result_type = converted_elem_type *;
#endif
return reinterpret_cast<result_type>(x);
} else {
using OpenCLType = ConvertToOpenCLType_t<no_ref>;
return convertDataToType<T, OpenCLType>(std::forward<T>(x));
}
}

template <typename To, typename From> auto convertFromOpenCLTypeFor(From &&x) {
Expand Down
Loading

0 comments on commit f52745e

Please sign in to comment.