From 7999e27b155017d22990d4037b5c4d83c0ae44ba Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 13 Feb 2024 15:45:10 -0800 Subject: [PATCH] [SYCL] Make nd_item stateless (#12236) The primary motivation for this change is to reduce the overhead of nd_item object creation. The nd_item object is created at the beginning of each "nd-range kernel". nd_item constructor initializes following members: - global item (global id, global range, global offset) - local item (local id, local range) - group (global range, local range, number of groups, group id) Most applications do not use all these data, so initializing them is unnecessary overhead. Due to compiler optimizations like aggressive inlining, SROA and dead code elimination, the overhead can be avoided in some cases. This patch removes all nd_item members and uses SPIR-V intrinsics to get access to the data we keep as nd_item members. This achieved though following changes: 1. group class member functions async_workg_group_copy and wait_for are inlined to nd_item class. 2. global and local item members are removed. The data obtained via SPIR-V instrinsics. --- sycl/include/sycl/nd_item.hpp | 506 +++++++++++++++++++++++++++++++++- 1 file changed, 503 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/nd_item.hpp b/sycl/include/sycl/nd_item.hpp index 26d1f4351c915..1382a12040b66 100644 --- a/sycl/include/sycl/nd_item.hpp +++ b/sycl/include/sycl/nd_item.hpp @@ -10,10 +10,13 @@ #include // for __spirv_ControlBarrier #include // for Scope +#include // for initLocalInvocationId #include // for mode, fence_space #include // for __SYCL_ASSUME_INT #include // for __SYCL2020_DEPRECATED, __SY... +#include // for ConvertToOpenCLType_t #include // for getSPIRVMemorySemanticsMask +#include // for is_bool, change_base_... #include // for device_event #include // for make_error_code, errc, exce... #include // for group @@ -38,6 +41,502 @@ namespace ext::oneapi::experimental { template class root_group; } +#if __INTEL_PREVIEW_BREAKING_CHANGES +/// Identifies an instance of the function object executing at each point in an +/// nd_range. +/// +/// \ingroup sycl_api +template class nd_item { +public: + static constexpr int dimensions = Dimensions; + + id get_global_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initGlobalInvocationId>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + size_t __SYCL_ALWAYS_INLINE get_global_id(int Dimension) const { + size_t Id = get_global_id()[Dimension]; + __SYCL_ASSUME_INT(Id); + return Id; + } + + size_t __SYCL_ALWAYS_INLINE get_global_linear_id() const { + size_t LinId = 0; + id Index = get_global_id(); + range Extent = get_global_range(); + id Offset = get_offset(); + if (1 == Dimensions) { + LinId = Index[0] - Offset[0]; + } else if (2 == Dimensions) { + LinId = (Index[0] - Offset[0]) * Extent[1] + Index[1] - Offset[1]; + } else { + LinId = (Index[0] - Offset[0]) * Extent[1] * Extent[2] + + (Index[1] - Offset[1]) * Extent[2] + Index[2] - Offset[2]; + } + __SYCL_ASSUME_INT(LinId); + return LinId; + } + + id get_local_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initLocalInvocationId>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + size_t __SYCL_ALWAYS_INLINE get_local_id(int Dimension) const { + size_t Id = get_local_id()[Dimension]; + __SYCL_ASSUME_INT(Id); + return Id; + } + + size_t get_local_linear_id() const { + size_t LinId = 0; + id Index = get_local_id(); + range Extent = get_local_range(); + if (1 == Dimensions) { + LinId = Index[0]; + } else if (2 == Dimensions) { + LinId = Index[0] * Extent[1] + Index[1]; + } else { + LinId = + Index[0] * Extent[1] * Extent[2] + Index[1] * Extent[2] + Index[2]; + } + __SYCL_ASSUME_INT(LinId); + return LinId; + } + + group get_group() const { + // TODO: ideally Group object should be stateless and have a contructor with + // no arguments. + return detail::Builder::createGroup(get_global_range(), get_local_range(), + get_group_range(), get_group_id()); + } + + sub_group get_sub_group() const { return sub_group(); } + + size_t __SYCL_ALWAYS_INLINE get_group(int Dimension) const { + size_t Id = get_group_id()[Dimension]; + __SYCL_ASSUME_INT(Id); + return Id; + } + + size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const { + size_t LinId = 0; + id Index = get_group_id(); + range Extent = get_group_range(); + if (1 == Dimensions) { + LinId = Index[0]; + } else if (2 == Dimensions) { + LinId = Index[0] * Extent[1] + Index[1]; + } else { + LinId = + Index[0] * Extent[1] * Extent[2] + Index[1] * Extent[2] + Index[2]; + } + __SYCL_ASSUME_INT(LinId); + return LinId; + } + + range get_group_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initNumWorkgroups>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + size_t __SYCL_ALWAYS_INLINE get_group_range(int Dimension) const { + size_t Range = get_group_range()[Dimension]; + __SYCL_ASSUME_INT(Range); + return Range; + } + + range get_global_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initGlobalSize>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + size_t get_global_range(int Dimension) const { + size_t Val = get_global_range()[Dimension]; + __SYCL_ASSUME_INT(Val); + return Val; + } + + range get_local_range() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initWorkgroupSize>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + size_t get_local_range(int Dimension) const { + size_t Id = get_local_range()[Dimension]; + __SYCL_ASSUME_INT(Id); + return Id; + } + + __SYCL2020_DEPRECATED("offsets are deprecated in SYCL 2020") + id get_offset() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initGlobalOffset>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } + + nd_range get_nd_range() const { + return nd_range(get_global_range(), get_local_range(), + get_offset()); + } + + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + uint32_t flags = _V1::detail::getSPIRVMemorySemanticsMask(accessSpace); + __spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup, + flags); + } + + /// Executes a work-group mem-fence with memory ordering on the local address + /// space, global address space or both based on the value of \p accessSpace. + template + __SYCL2020_DEPRECATED("use sycl::atomic_fence() free function instead") + void mem_fence( + typename std::enable_if_t + accessSpace = access::fence_space::global_and_local) const { + uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace); + // TODO: currently, there is no good way in SPIR-V to set the memory + // barrier only for load operations or only for store operations. + // The full read-and-write barrier is used and the template parameter + // 'accessMode' is ignored for now. Either SPIR-V or SYCL spec may be + // changed to address this discrepancy between SPIR-V and SYCL, + // or if we decide that 'accessMode' is the important feature then + // we can fix this later, for example, by using OpenCL 1.2 functions + // read_mem_fence() and write_mem_fence(). + __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest + /// with a source stride specified by \p srcStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + std::enable_if_t::value, + device_event> async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements, + size_t srcStride) const { + using DestT = detail::ConvertToOpenCLType_t; + using SrcT = detail::ConvertToOpenCLType_t; + + __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( + __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), + numElements, srcStride, 0); + return device_event(E); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest with + /// the destination stride specified by \p destStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + std::enable_if_t::value, + device_event> async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements, + size_t destStride) + const { + using DestT = detail::ConvertToOpenCLType_t; + using SrcT = detail::ConvertToOpenCLType_t; + + __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( + __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), + numElements, destStride, 0); + return device_event(E); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest + /// with a source stride specified by \p srcStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for DestDataT are all scalar and vector types, except + /// boolean. SrcDataT must be either the same as DestDataT or const DestDataT. + template + std::enable_if_t::value && + std::is_same_v, DestDataT>, + device_event> + async_work_group_copy(decorated_local_ptr dest, + decorated_global_ptr src, size_t numElements, + size_t srcStride) const { + using DestT = detail::ConvertToOpenCLType_t; + using SrcT = detail::ConvertToOpenCLType_t; + + __ocl_event_t E = __SYCL_OpGroupAsyncCopyGlobalToLocal( + __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), + numElements, srcStride, 0); + return device_event(E); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest with + /// the destination stride specified by \p destStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for DestDataT are all scalar and vector types, except + /// boolean. SrcDataT must be either the same as DestDataT or const DestDataT. + template + std::enable_if_t::value && + std::is_same_v, DestDataT>, + device_event> + async_work_group_copy(decorated_global_ptr dest, + decorated_local_ptr src, size_t numElements, + size_t destStride) const { + using DestT = detail::ConvertToOpenCLType_t; + using SrcT = detail::ConvertToOpenCLType_t; + + __ocl_event_t E = __SYCL_OpGroupAsyncCopyLocalToGlobal( + __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), + numElements, destStride, 0); + return device_event(E); + } + + /// Specialization for scalar bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + std::enable_if_t< + detail::is_scalar_bool::value, + device_event> async_work_group_copy(multi_ptr + Dest, + multi_ptr + Src, + size_t NumElements, + size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + auto DestP = multi_ptr( + reinterpret_cast(Dest.get())); + auto SrcP = multi_ptr( + reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for vector bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + std::enable_if_t< + detail::is_vector_bool::value, + device_event> async_work_group_copy(multi_ptr + Dest, + multi_ptr + Src, + size_t NumElements, + size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using VecT = detail::change_base_type_t; + auto DestP = address_space_cast( + reinterpret_cast(Dest.get())); + auto SrcP = address_space_cast( + reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for scalar bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + std::enable_if_t::value && + std::is_same_v, DestT>, + device_event> + async_work_group_copy(multi_ptr Dest, + multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using QualSrcT = + std::conditional_t, const uint8_t, uint8_t>; + auto DestP = multi_ptr( + detail::cast_AS::pointer>( + Dest.get_decorated())); + auto SrcP = multi_ptr( + detail::cast_AS::pointer>( + Src.get_decorated())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for vector bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + std::enable_if_t::value && + std::is_same_v, DestT>, + device_event> + async_work_group_copy(multi_ptr Dest, + multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using VecT = detail::change_base_type_t; + using QualSrcVecT = + std::conditional_t, std::add_const_t, VecT>; + auto DestP = multi_ptr( + detail::cast_AS< + typename multi_ptr::pointer>( + Dest.get_decorated())); + auto SrcP = multi_ptr( + detail::cast_AS::pointer>( + Src.get_decorated())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + device_event + async_work_group_copy(local_ptr dest, global_ptr src, + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. + template + __SYCL2020_DEPRECATED("Use decorated multi_ptr arguments instead") + device_event + async_work_group_copy(global_ptr dest, local_ptr src, + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for DestDataT are all scalar and vector types. SrcDataT + /// must be either the same as DestDataT or const DestDataT. + template + typename std::enable_if_t< + std::is_same_v>, device_event> + async_work_group_copy(decorated_local_ptr dest, + decorated_global_ptr src, + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); + } + + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for DestDataT are all scalar and vector types. SrcDataT + /// must be either the same as DestDataT or const DestDataT. + template + typename std::enable_if_t< + std::is_same_v>, device_event> + async_work_group_copy(decorated_global_ptr dest, + decorated_local_ptr src, + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); + } + + template void wait_for(eventTN... events) const { + waitForHelper(events...); + } + + sycl::ext::oneapi::experimental::root_group + ext_oneapi_get_root_group() const { + return sycl::ext::oneapi::experimental::root_group{*this}; + } + + nd_item(const nd_item &rhs) = default; + nd_item(nd_item &&rhs) = default; + + nd_item &operator=(const nd_item &rhs) = default; + nd_item &operator=(nd_item &&rhs) = default; + + bool operator==(const nd_item &) const { return true; } + bool operator!=(const nd_item &rhs) const { return !((*this) == rhs); } + +protected: + friend class detail::Builder; + nd_item() {} + nd_item(const item &, const item &, + const group &) {} + + void waitForHelper() const {} + + void waitForHelper(device_event Event) const { Event.wait(); } + + template + void waitForHelper(T E, Ts... Es) const { + waitForHelper(E); + waitForHelper(Es...); + } + + id get_group_id() const { +#ifdef __SYCL_DEVICE_ONLY__ + return __spirv::initWorkgroupId>(); +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "nd_item methods can't be invoked on the host"); + return {}; +#endif + } +}; +#else /// Identifies an instance of the function object executing at each point in an /// nd_range. /// @@ -81,9 +580,9 @@ template class nd_item { sub_group get_sub_group() const { return sub_group(); } size_t __SYCL_ALWAYS_INLINE get_group(int Dimension) const { - size_t Size = Group[Dimension]; - __SYCL_ASSUME_INT(Size); - return Size; + size_t Id = Group[Dimension]; + __SYCL_ASSUME_INT(Id); + return Id; } size_t __SYCL_ALWAYS_INLINE get_group_linear_id() const { @@ -246,6 +745,7 @@ template class nd_item { item localItem; group Group; }; +#endif template __SYCL_DEPRECATED("use sycl::ext::oneapi::experimental::this_nd_item() instead")