Commit 6cb11c7

General code cleanup
Signed-off-by: Matthew Michel <matthew.michel@intel.com>
mmichel11 committed Dec 19, 2024
1 parent 8ae18db commit 6cb11c7
Showing 6 changed files with 137 additions and 133 deletions.
7 changes: 4 additions & 3 deletions include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
@@ -46,7 +46,8 @@ __pattern_walk1_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
 
     auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-        unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{{}, __f, std::size_t(__n)},
+        unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__view)>{
+            {}, __f, static_cast<std::size_t>(__n)},
         __n, __view);
     return __future_obj;
 }
@@ -74,7 +75,7 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
     auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
-            {}, __f, std::size_t(__n)},
+            {}, __f, static_cast<std::size_t>(__n)},
         __n, __view1, __view2);
 
     return __future.__make_future(__first2 + __n);
@@ -106,7 +107,7 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For
     auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
-                                                decltype(__view3)>{{}, __f, size_t(__n)},
+                                                decltype(__view3)>{{}, __f, static_cast<size_t>(__n)},
         __n, __view1, __view2, __view3);
 
     return __future.__make_future(__first3 + __n);
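
Note: the recurring change in this file replaces functional-style casts such as std::size_t(__n) with static_cast. A minimal sketch of the distinction, using illustrative names rather than library code:

#include <cstddef>

// A functional-style cast with one argument has C-style cast semantics and
// can silently perform conversions static_cast would reject; static_cast
// states the intent and is easy to grep for during review.
void cast_styles(long n)
{
    std::size_t a = std::size_t(n);              // terse, C-style semantics
    std::size_t b = static_cast<std::size_t>(n); // explicit value conversion
    (void)a;
    (void)b;
}
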
20 changes: 10 additions & 10 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -111,7 +111,7 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
     auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
-            {}, __f, size_t(__n)},
+            {}, __f, static_cast<std::size_t>(__n)},
         __n, __view1, __view2);
 
     // Call no wait, wait or deferrable wait depending on _WaitMode
@@ -157,7 +157,7 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Forw
     auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2)>{
-            {}, __f, size_t(__n)},
+            {}, __f, static_cast<std::size_t>(__n)},
         __n, __view1, __view2);
     __future.wait(__par_backend_hetero::__deferrable_mode{});
     return __first2 + __n;
@@ -192,9 +192,9 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt
     auto __view3 = __buf3.all_view();
 
     oneapi::dpl::__par_backend_hetero::__parallel_for(
-        _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
+        _BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
         unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__view1), decltype(__view2),
-                                                decltype(__view3)>{{}, __f, size_t(__n)},
+                                                decltype(__view3)>{{}, __f, static_cast<std::size_t>(__n)},
         __n, __view1, __view2, __view3)
         .__deferrable_wait();
 
@@ -1597,8 +1597,8 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato
     auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>();
     auto __buf = __keep(__first, __last);
     oneapi::dpl::__par_backend_hetero::__parallel_for(
-        _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-        unseq_backend::__reverse_functor<typename ::std::iterator_traits<_Iterator>::difference_type,
+        _BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
+        unseq_backend::__reverse_functor<typename std::iterator_traits<_Iterator>::difference_type,
                                          decltype(__buf.all_view())>{{}, __n},
         __n / 2, __buf.all_view())
         .__deferrable_wait();
@@ -1626,8 +1626,8 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi
     auto __view1 = __buf1.all_view();
     auto __view2 = __buf2.all_view();
     oneapi::dpl::__par_backend_hetero::__parallel_for(
-        _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec),
-        unseq_backend::__reverse_copy<typename ::std::iterator_traits<_BidirectionalIterator>::difference_type,
+        _BackendTag{}, std::forward<_ExecutionPolicy>(__exec),
+        unseq_backend::__reverse_copy<typename std::iterator_traits<_BidirectionalIterator>::difference_type,
                                       decltype(__view1), decltype(__view2)>{{}, __n},
         __n, __view1, __view2)
         .__deferrable_wait();
@@ -1670,7 +1670,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
     const auto __shift = __new_first - __first;
     oneapi::dpl::__par_backend_hetero::__parallel_for(
         _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec),
-        unseq_backend::__rotate_copy<typename ::std::iterator_traits<_Iterator>::difference_type, decltype(__view),
+        unseq_backend::__rotate_copy<typename std::iterator_traits<_Iterator>::difference_type, decltype(__view),
                                      decltype(__temp_rng_w)>{{}, __n, __shift},
         __n, __view, __temp_rng_w);
 
@@ -1683,7 +1683,7 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator
     auto __brick = unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw),
                                                            decltype(__buf.all_view())>{
         {}, _Function{}, static_cast<std::size_t>(__n)};
-    oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick,
+    oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick,
                                                       __n, __temp_rng_rw, __buf.all_view())
         .__deferrable_wait();
 
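
Note: several hunks above end in .__deferrable_wait() or __future.wait(__par_backend_hetero::__deferrable_mode{}), per the comment "Call no wait, wait or deferrable wait depending on _WaitMode". A tag-dispatch sketch of that idea; the type and macro names below are assumptions for illustration, not oneDPL's actual API:

#include <type_traits>

struct __no_wait_mode {};
struct __blocking_mode {};
struct __deferrable_mode_tag {};

// Future-like wrapper that chooses its blocking behavior at compile time.
template <typename _Event>
struct __future_sketch
{
    _Event __event;

    template <typename _WaitMode>
    void
    wait(_WaitMode)
    {
        if constexpr (std::is_same_v<_WaitMode, __blocking_mode>)
            __event.wait(); // always block on completion
        else if constexpr (std::is_same_v<_WaitMode, __deferrable_mode_tag>)
        {
#ifndef SKETCH_ALLOW_DEFERRED_WAITING // hypothetical opt-in macro
            __event.wait(); // block unless deferred waiting is enabled
#endif
        }
        // __no_wait_mode: return immediately; the caller keeps __event.
    }
};
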
@@ -28,7 +28,6 @@
 #include <cmath>
 #include <limits>
 #include <cstdint>
-#include <tuple>
 
 #include "../../iterator_impl.h"
 #include "../../execution_impl.h"
10 changes: 5 additions & 5 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h
@@ -60,7 +60,7 @@ struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name..
     {
         assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
         _PRINT_INFO_IN_DEBUG_MODE(__exec);
-        auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
+        auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) {
             //get an access to data under SYCL buffer:
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
 
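
Note: the capture-list change above switches the command-group lambda from reference captures ([&__rngs..., &__brick, __count]) to value captures ([__rngs..., __brick, __count]), so the submitted work owns copies of the ranges and the brick rather than referencing locals of the submitting scope. A simplified standalone sketch of the value-capture pattern (not library code):

#include <sycl/sycl.hpp>

// Capture kernel inputs by value in the command-group function; the
// enqueued work then cannot dangle on stack locals of the caller.
sycl::event
submit_fill(sycl::queue q, sycl::buffer<int, 1> buf, int value)
{
    return q.submit([buf, value](sycl::handler& cgh) mutable {
        sycl::accessor acc{buf, cgh, sycl::write_only};
        cgh.parallel_for(sycl::range<1>(acc.size()),
                         [=](sycl::id<1> i) { acc[i] = value; });
    });
}
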
@@ -142,20 +142,20 @@ struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name..
     {
         assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0);
         _PRINT_INFO_IN_DEBUG_MODE(__exec);
-        auto __event = __exec.queue().submit([&__rngs..., &__brick, &__exec, __count](sycl::handler& __cgh) {
+        auto __event = __exec.queue().submit([__rngs..., __brick, __exec, __count](sycl::handler& __cgh) {
             //get an access to data under SYCL buffer:
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
             constexpr static std::uint16_t __iters_per_work_item = _Fp::__preferred_iters_per_item;
             const std::size_t __work_group_size =
                 oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size);
             const std::size_t __num_groups =
-                oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * decltype(__brick)::__preferred_vector_size * __iters_per_work_item));
+                oneapi::dpl::__internal::__dpl_ceiling_div(__count, (__work_group_size * _Fp::__preferred_vector_size * __iters_per_work_item));
             const std::size_t __num_items = __num_groups * __work_group_size;
             __cgh.parallel_for<_Name...>(
                 sycl::nd_range(sycl::range<1>(__num_items), sycl::range<1>(__work_group_size)),
                 [=](sycl::nd_item</*dim=*/1> __item) {
                     auto [__idx, __stride, __is_full] =
-                        __stride_recommender(__item, __count, __iters_per_work_item, decltype(__brick)::__preferred_vector_size, __work_group_size);
+                        __stride_recommender(__item, __count, __iters_per_work_item, _Fp::__preferred_vector_size, __work_group_size);
                     __strided_loop<__iters_per_work_item> __execute_loop{static_cast<std::size_t>(__count)};
                     if (__is_full)
                     {
@@ -190,7 +190,7 @@ __parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&&
     // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a
     // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1,
     // then only compile the basic kernel as the two versions are effectively the same.
-    if constexpr (_Fp::__preferred_iters_per_item > 1)
+    if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1)
     {
         if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick))
         {
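
Note: the widened condition above also compiles the large-input kernel when the brick only benefits from vectorization (__preferred_vector_size > 1) rather than from multiple iterations per item. A reduced sketch of the compile-time/run-time dispatch; the threshold and callable names here are illustrative:

#include <cstddef>

// Instantiate the specialized large-input path only when the brick can
// exploit it; otherwise only the basic kernel exists in the binary.
template <typename _Brick, typename _SmallSubmit, typename _LargeSubmit>
void
dispatch_parallel_for(_Brick brick, _SmallSubmit small, _LargeSubmit large, std::size_t count)
{
    if constexpr (_Brick::__preferred_iters_per_item > 1 || _Brick::__preferred_vector_size > 1)
    {
        constexpr std::size_t large_cutoff = 1 << 16; // hypothetical; the real code queries the device
        if (count >= large_cutoff)
        {
            large(brick, count); // strided, vectorized kernel
            return;
        }
    }
    small(brick, count); // one (or few) elements per work item
}
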
57 changes: 29 additions & 28 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -845,26 +845,26 @@ struct __lazy_load_transform_op
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_load
 {
-    static_assert(__vec_size <= 4);
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
     template <typename _IdxType, typename _LoadOp, typename... _Acc>
     void
     operator()(std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
             __load_op(__start_idx + __i, __i, __acc...);
     }
 
     template <typename _IdxType, typename _LoadOp, typename... _Acc>
     void
     operator()(std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Acc... __acc) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
             __load_op(__start_idx + __i, __i, __acc...);
     }
 };
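
Note: the std::min change above is forced by type deduction: std::min has a single template parameter, so mixing the (now std::uint8_t) vector size with a std::size_t remainder does not compile, and the old decltype(__vec_size)(...) cast narrowed the remainder before the comparison. A minimal sketch of the pitfall:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Count the elements remaining in a partial vector at the end of a range.
std::uint8_t
tail_count(std::size_t n, std::size_t start_idx, std::uint8_t vec_size)
{
    // std::min(vec_size, n - start_idx);               // ill-formed: mixed argument types
    // std::min(vec_size, std::uint8_t(n - start_idx)); // truncates large remainders first
    // Compare in the wide type; the result is <= vec_size, so narrowing is safe.
    return static_cast<std::uint8_t>(std::min(std::size_t{vec_size}, n - start_idx));
}
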
@@ -891,20 +891,19 @@ struct __lazy_store_transform_op
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_walk
 {
-    static_assert(__vec_size <= 4);
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
 
     template <typename _IdxType, typename _WalkFunction, typename... _Rngs>
     void
     operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
         {
-
             __f(__rngs[__idx + __i]...);
         }
     }
@@ -914,61 +913,63 @@ struct __vector_walk
     void
     operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
         {
             __f(__rngs[__idx + __i]...);
         }
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_store
 {
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     std::size_t __n;
-    static_assert(__vec_size <= 4);
-    template <typename _IdxType, typename _StoreOp, typename... _Acc>
+
+    template <typename _IdxType, typename _StoreOp, typename... _Rngs>
     void
-    operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
+    operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __vec_size; ++__i)
-            __store_op(__i, __start_idx + __i, __acc...);
+        for (std::uint8_t __i = 0; __i < __vec_size; ++__i)
+            __store_op(__i, __start_idx + __i, __rngs...);
     }
-    template <typename _IdxType, typename _StoreOp, typename... _Acc>
+    template <typename _IdxType, typename _StoreOp, typename... _Rngs>
     void
-    operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Acc... __acc) const
+    operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs... __rngs) const
     {
-        std::uint16_t __elements = std::min(__vec_size, decltype(__vec_size)(__n - __start_idx));
-        for (std::uint16_t __i = 0; __i < __elements; ++__i)
-            __store_op(__i, __start_idx + __i, __acc...);
+        std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__n - __start_idx});
+        for (std::uint8_t __i = 0; __i < __elements; ++__i)
+            __store_op(__i, __start_idx + __i, __rngs...);
     }
 };
 
-template <std::uint16_t __vec_size>
+template <std::uint8_t __vec_size>
 struct __vector_reverse
 {
+    static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported");
     template <typename _IsFull, typename _Idx, typename _Array>
     void
     operator()(_IsFull __is_full, const _Idx __elements_to_process, _Array __array) const
     {
         if constexpr (__is_full)
         {
             _ONEDPL_PRAGMA_UNROLL
-            for (std::uint16_t __i = 0; __i != __vec_size / 2; ++__i)
+            for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i)
                 std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v);
         }
         else
         {
-            for (std::uint16_t __i = 0; __i != __elements_to_process / 2; ++__i)
+            for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i)
                 std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v);
         }
     }
 };
 
 // Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns
 // (potentially with vectorization)
-template <std::uint16_t __num_strides>
+template <std::uint8_t __num_strides>
 struct __strided_loop
 {
     std::size_t __n;
@@ -978,7 +979,7 @@ struct __strided_loop
                _Ranges&&... __rngs) const
     {
         _ONEDPL_PRAGMA_UNROLL
-        for (std::uint16_t __i = 0; __i < __num_strides; ++__i)
+        for (std::uint8_t __i = 0; __i < __num_strides; ++__i)
         {
             __loop_body_op(std::true_type{}, __idx, __rngs...);
             __idx += __stride;
@@ -992,7 +993,7 @@ struct __strided_loop
         // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation
         const std::uint8_t __adjusted_iters_per_work_item =
             oneapi::dpl::__internal::__dpl_ceiling_div(__n - __idx, __stride);
-        for (std::uint16_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
+        for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i)
        {
             __loop_body_op(std::false_type{}, __idx, __rngs...);
             __idx += __stride;
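
Note: __strided_loop above applies a body at a work-group/sub-group stride; full passes take the unrolled std::true_type path, and the tail clamps its trip count and takes the std::false_type path. A self-contained sketch of the same access pattern in plain C++ (no SYCL), with illustrative names:

#include <cstddef>
#include <cstdint>
#include <type_traits>

// Each logical work item visits idx, idx + stride, idx + 2 * stride, ...
// so neighboring work items touch neighboring elements (coalesced loads).
template <std::uint8_t __num_strides, typename _Body>
void
strided_loop(std::size_t n, std::size_t idx, std::size_t stride, _Body body)
{
    if (idx + (__num_strides - 1) * stride < n)
    {
        // Full case: constant trip count, eligible for unrolling.
        for (std::uint8_t i = 0; i < __num_strides; ++i, idx += stride)
            body(std::true_type{}, idx);
    }
    else if (idx < n)
    {
        // Tail case: clamp the trip count and tell the body it is partial.
        const std::uint8_t iters =
            static_cast<std::uint8_t>((n - idx + stride - 1) / stride);
        for (std::uint8_t i = 0; i < iters; ++i, idx += stride)
            body(std::false_type{}, idx);
    }
}
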