From 9d1cbc51854f19f89105d502db9156b11e4507f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Tue, 30 Apr 2024 08:25:52 +0100 Subject: [PATCH] [SYCL][COMPAT] nd_range barriers seq_cst by default in supported devices (#12974) AMD ~~and CUDA~~ devices still not supported. ~~CUDA to be supported in https://github.com/intel/llvm/pull/12516~~ Edit: Since #12516 has been merged, CUDA is also `seq_cst` by default. --- sycl/doc/syclcompat/README.md | 11 +- sycl/include/syclcompat/util.hpp | 17 ++- .../util/util_nd_range_barrier_test.cpp | 108 +++++++++--------- 3 files changed, 76 insertions(+), 60 deletions(-) diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index 396be8ac13e8b..6ab55611c12c7 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -1184,17 +1184,24 @@ spec, and so should be used with caution. namespace syclcompat { namespace experimental { +#if defined(__AMDGPU__) +// seq_cst currently not working for AMD +constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel; +#else +constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst; +#endif + template inline void nd_range_barrier( sycl::nd_item item, - sycl::atomic_ref &counter); template <> inline void nd_range_barrier( sycl::nd_item<1> item, - sycl::atomic_ref &counter); diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp index 170be2671bcce..8ce3b29d5b14f 100644 --- a/sycl/include/syclcompat/util.hpp +++ b/sycl/include/syclcompat/util.hpp @@ -493,6 +493,15 @@ inline int get_sycl_language_version() { } namespace experimental { + +// FIXME(@intel/syclcompat-lib-reviewers): unify once supported in the AMD +// backend. +#if defined(__AMDGPU__) +constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel; +#else +constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst; +#endif + /// Synchronize work items from all work groups within a SYCL kernel. /// \param [in] item: Represents a work group. /// \param [in] counter: An atomic object defined on a device memory which can @@ -502,8 +511,8 @@ namespace experimental { /// a SYCL kernel can be scheduled actively at the same time on a device. template inline void nd_range_barrier( - sycl::nd_item item, - sycl::atomic_ref &item, + sycl::atomic_ref &counter) { @@ -542,8 +551,8 @@ inline void nd_range_barrier( /// a SYCL kernel can be scheduled actively at the same time on a device. template <> inline void nd_range_barrier( - sycl::nd_item<1> item, - sycl::atomic_ref &item, + sycl::atomic_ref &counter) { unsigned int num_groups = item.get_group_range(0); diff --git a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp b/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp index 9a4ebe441352c..c5835654ac9f2 100644 --- a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp +++ b/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp @@ -40,19 +40,19 @@ #include #include -void kernel_1( - sycl::nd_item<3> item_ct1, - sycl::atomic_ref &sync_ct1) { +void kernel_1(sycl::nd_item<3> item_ct1, + sycl::atomic_ref< + unsigned int, syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space> &sync_ct1) { syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); } -void kernel_2( - sycl::nd_item<3> item_ct1, - sycl::atomic_ref &sync_ct1) { +void kernel_2(sycl::nd_item<3> item_ct1, + sycl::atomic_ref< + unsigned int, syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space> &sync_ct1) { syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); @@ -70,18 +70,18 @@ void test_nd_range_barrier_dim3() { q_ct1 ->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, 4) * - sycl::range<3>(1, 1, 4), - sycl::range<3>(1, 1, 4)), - [=](sycl::nd_item<3> item_ct1) { - auto atm_sync_ct1 = - sycl::atomic_ref( - sync_ct1[0]); - kernel_1(item_ct1, atm_sync_ct1); - }); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) * + sycl::range<3>(1, 1, 4), + sycl::range<3>(1, 1, 4)), + [=](sycl::nd_item<3> item_ct1) { + auto atm_sync_ct1 = sycl::atomic_ref< + unsigned int, + syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space>( + sync_ct1[0]); + kernel_1(item_ct1, atm_sync_ct1); + }); }) .wait(); } @@ -95,37 +95,37 @@ void test_nd_range_barrier_dim3() { q_ct1 ->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, 4) * - sycl::range<3>(1, 1, 4), - sycl::range<3>(1, 1, 4)), - [=](sycl::nd_item<3> item_ct1) { - auto atm_sync_ct1 = - sycl::atomic_ref( - sync_ct1[0]); - kernel_2(item_ct1, atm_sync_ct1); - }); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) * + sycl::range<3>(1, 1, 4), + sycl::range<3>(1, 1, 4)), + [=](sycl::nd_item<3> item_ct1) { + auto atm_sync_ct1 = sycl::atomic_ref< + unsigned int, + syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space>( + sync_ct1[0]); + kernel_2(item_ct1, atm_sync_ct1); + }); }) .wait(); } dev_ct1.queues_wait_and_throw(); } -void kernel_1( - sycl::nd_item<1> item_ct1, - sycl::atomic_ref &sync_ct1) { +void kernel_1(sycl::nd_item<1> item_ct1, + sycl::atomic_ref< + unsigned int, syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space> &sync_ct1) { syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); } -void kernel_2( - sycl::nd_item<1> item_ct1, - sycl::atomic_ref &sync_ct1) { +void kernel_2(sycl::nd_item<1> item_ct1, + sycl::atomic_ref< + unsigned int, syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space> &sync_ct1) { syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1); @@ -148,11 +148,11 @@ void test_nd_range_barrier_dim1() { sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4), sycl::range<1>(4)), [=](sycl::nd_item<1> item_ct1) { - auto atm_sync_ct1 = - sycl::atomic_ref( - sync_ct1[0]); + auto atm_sync_ct1 = sycl::atomic_ref< + unsigned int, + syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space>(sync_ct1[0]); kernel_1(item_ct1, atm_sync_ct1); }); }) @@ -171,11 +171,11 @@ void test_nd_range_barrier_dim1() { sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4), sycl::range<1>(4)), [=](sycl::nd_item<1> item_ct1) { - auto atm_sync_ct1 = - sycl::atomic_ref( - sync_ct1[0]); + auto atm_sync_ct1 = sycl::atomic_ref< + unsigned int, + syclcompat::experimental::barrier_memory_order, + sycl::memory_scope::device, + sycl::access::address_space::global_space>(sync_ct1[0]); kernel_2(item_ct1, atm_sync_ct1); }); })