diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp
index 62890087d6e2b..f6b9ccf937d41 100644
--- a/sycl/include/syclcompat/util.hpp
+++ b/sycl/include/syclcompat/util.hpp
@@ -280,6 +280,15 @@ inline int get_sycl_language_version() {
 }
 
 namespace experimental {
+
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+// FIXME: https://github.com/intel/llvm/pull/12516 adds seq_cst support for the
+// CUDA backend.
+constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel;
+#else
+constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst;
+#endif
+
 /// Synchronize work items from all work groups within a SYCL kernel.
 /// \param [in] item: Represents a work group.
 /// \param [in] counter: An atomic object defined on a device memory which can
@@ -289,8 +298,8 @@
 /// a SYCL kernel can be scheduled actively at the same time on a device.
 template <int dimensions = 3>
 inline void nd_range_barrier(
-    sycl::nd_item<dimensions> item,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
+    sycl::nd_item<dimensions> &item,
+    sycl::atomic_ref<unsigned int, barrier_memory_order,
                      sycl::memory_scope::device,
                      sycl::access::address_space::global_space> &counter) {
 
@@ -329,8 +338,8 @@ inline void nd_range_barrier(
 /// a SYCL kernel can be scheduled actively at the same time on a device.
 template <>
 inline void nd_range_barrier(
-    sycl::nd_item<1> item,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
+    sycl::nd_item<1> &item,
+    sycl::atomic_ref<unsigned int, barrier_memory_order,
                      sycl::memory_scope::device,
                      sycl::access::address_space::global_space> &counter) {
   unsigned int num_groups = item.get_group_range(0);
diff --git a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp b/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp
index 9a4ebe441352c..c5835654ac9f2 100644
--- a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp
+++ b/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp
@@ -40,19 +40,19 @@
 #include <sycl/sycl.hpp>
 #include <syclcompat/syclcompat.hpp>
 
-void kernel_1(
-    sycl::nd_item<3> item_ct1,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &sync_ct1) {
+void kernel_1(sycl::nd_item<3> item_ct1,
+              sycl::atomic_ref<
+                  unsigned int, syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space> &sync_ct1) {
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
 }
 
-void kernel_2(
-    sycl::nd_item<3> item_ct1,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &sync_ct1) {
+void kernel_2(sycl::nd_item<3> item_ct1,
+              sycl::atomic_ref<
+                  unsigned int, syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space> &sync_ct1) {
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
 
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
@@ -70,18 +70,18 @@
 
   q_ct1
       ->submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
-                                  sycl::range<3>(1, 1, 4),
-                              sycl::range<3>(1, 1, 4)),
-            [=](sycl::nd_item<3> item_ct1) {
-              auto atm_sync_ct1 =
-                  sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>(
-                      sync_ct1[0]);
-              kernel_1(item_ct1, atm_sync_ct1);
-            });
+        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
+                                               sycl::range<3>(1, 1, 4),
+                                           sycl::range<3>(1, 1, 4)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                           auto atm_sync_ct1 = sycl::atomic_ref<
+                               unsigned int,
+                               syclcompat::experimental::barrier_memory_order,
+                               sycl::memory_scope::device,
+                               sycl::access::address_space::global_space>(
+                               sync_ct1[0]);
+                           kernel_1(item_ct1, atm_sync_ct1);
+                         });
       })
       .wait();
 }
@@ -95,37 +95,37 @@
   q_ct1
       ->submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
-                                  sycl::range<3>(1, 1, 4),
-                              sycl::range<3>(1, 1, 4)),
-            [=](sycl::nd_item<3> item_ct1) {
-              auto atm_sync_ct1 =
-                  sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>(
-                      sync_ct1[0]);
-              kernel_2(item_ct1, atm_sync_ct1);
-            });
+        cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
+                                               sycl::range<3>(1, 1, 4),
+                                           sycl::range<3>(1, 1, 4)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                           auto atm_sync_ct1 = sycl::atomic_ref<
+                               unsigned int,
+                               syclcompat::experimental::barrier_memory_order,
+                               sycl::memory_scope::device,
+                               sycl::access::address_space::global_space>(
+                               sync_ct1[0]);
+                           kernel_2(item_ct1, atm_sync_ct1);
+                         });
       })
      .wait();
 }
 
   dev_ct1.queues_wait_and_throw();
 }
 
-void kernel_1(
-    sycl::nd_item<1> item_ct1,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &sync_ct1) {
+void kernel_1(sycl::nd_item<1> item_ct1,
+              sycl::atomic_ref<
+                  unsigned int, syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space> &sync_ct1) {
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
 }
 
-void kernel_2(
-    sycl::nd_item<1> item_ct1,
-    sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &sync_ct1) {
+void kernel_2(sycl::nd_item<1> item_ct1,
+              sycl::atomic_ref<
+                  unsigned int, syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space> &sync_ct1) {
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
 
   syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
@@ -148,11 +148,11 @@
             sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4),
                               sycl::range<1>(4)),
             [=](sycl::nd_item<1> item_ct1) {
-              auto atm_sync_ct1 =
-                  sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>(
-                      sync_ct1[0]);
+              auto atm_sync_ct1 = sycl::atomic_ref<
+                  unsigned int,
+                  syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space>(sync_ct1[0]);
               kernel_1(item_ct1, atm_sync_ct1);
             });
         })
@@ -171,11 +171,11 @@
             sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4),
                               sycl::range<1>(4)),
             [=](sycl::nd_item<1> item_ct1) {
-              auto atm_sync_ct1 =
-                  sycl::atomic_ref<unsigned int, sycl::memory_order::seq_cst,
-                                   sycl::memory_scope::device,
-                                   sycl::access::address_space::global_space>(
-                      sync_ct1[0]);
+              auto atm_sync_ct1 = sycl::atomic_ref<
+                  unsigned int,
+                  syclcompat::experimental::barrier_memory_order,
+                  sycl::memory_scope::device,
+                  sycl::access::address_space::global_space>(sync_ct1[0]);
               kernel_2(item_ct1, atm_sync_ct1);
             });
         })