Skip to content

Commit

Permalink
[SYCL] Add tests for atomic operations on malloc_shared-allocated mem…
Browse files Browse the repository at this point in the history
…ory (#12480)

This duplicates existing SYCL atomic tests that use buffers with
identical tests that use USM and memory allocated using
`sycl::malloc_shared`. The motivation behind exercising atomics on
`malloc_shared`-allocated memory is to catch issues such as
ROCm/ROCm#848 where native atomic instructions
may fail when crossing the PCIe bus.
  • Loading branch information
pasaulais authored Feb 29, 2024
1 parent 68fd440 commit 42c0092
Show file tree
Hide file tree
Showing 12 changed files with 839 additions and 0 deletions.
187 changes: 187 additions & 0 deletions sycl/test-e2e/AtomicRef/add.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,47 @@ void add_fetch_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_fetch_test_usm_shared(queue q, size_t N) {
T *sum = malloc_shared<T>(1, q);
T *output = malloc_shared<T>(N, q);
T *output_begin = &output[0], *output_end = &output[N];
sum[0] = T(0);
std::fill(output_begin, output_end, T(0));
{
q.submit([&](handler &cgh) {
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = AtomicRef < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, space > (sum[0]);
output[gid] = atm.fetch_add(Difference(1), order);
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N.
assert(sum[0] == T(N));

// Fetch returns original value: will be in [0, N-1].
auto min_e = std::min_element(output_begin, output_end);
auto max_e = std::max_element(output_begin, output_end);
assert(*min_e == T(0) && *max_e == T(N - 1));

// Intermediate values should be unique.
std::sort(output_begin, output_end);
assert(std::unique(output_begin, output_end) == output_end);

free(sum, q);
free(output, q);
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
Expand Down Expand Up @@ -146,6 +187,47 @@ void add_plus_equal_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_plus_equal_test_usm_shared(queue q, size_t N) {
T *sum = malloc_shared<T>(1, q);
T *output = malloc_shared<T>(N, q);
T *output_begin = &output[0], *output_end = &output[N];
sum[0] = T(0);
std::fill(output_begin, output_end, T(0));
{
q.submit([&](handler &cgh) {
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = AtomicRef < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, space > (sum[0]);
output[gid] = atm += Difference(1);
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N.
assert(sum[0] == T(N));

// += returns updated value: will be in [1, N].
auto min_e = std::min_element(output_begin, output_end);
auto max_e = std::max_element(output_begin, output_end);
assert(*min_e == T(1) && *max_e == T(N));

// Intermediate values should be unique.
std::sort(output_begin, output_end);
assert(std::unique(output_begin, output_end) == output_end);

free(sum, q);
free(output, q);
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
Expand Down Expand Up @@ -188,6 +270,46 @@ void add_pre_inc_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_pre_inc_test_usm_shared(queue q, size_t N) {
T *sum = malloc_shared<T>(1, q);
T *output = malloc_shared<T>(N, q);
T *output_begin = &output[0], *output_end = &output[N];
sum[0] = T(0);
{
q.submit([&](handler &cgh) {
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = AtomicRef < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, space > (sum[0]);
output[gid] = ++atm;
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N.
assert(sum[0] == T(N));

// Pre-increment returns updated value: will be in [1, N].
auto min_e = std::min_element(output_begin, output_end);
auto max_e = std::max_element(output_begin, output_end);
assert(*min_e == T(1) && *max_e == T(N));

// Intermediate values should be unique.
std::sort(output_begin, output_end);
assert(std::unique(output_begin, output_end) == output_end);

free(sum, q);
free(output, q);
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
Expand Down Expand Up @@ -230,6 +352,46 @@ void add_post_inc_test(queue q, size_t N) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void add_post_inc_test_usm_shared(queue q, size_t N) {
T *sum = malloc_shared<T>(1, q);
T *output = malloc_shared<T>(N, q);
T *output_begin = &output[0], *output_end = &output[N];
sum[0] = T(0);
{
q.submit([&](handler &cgh) {
cgh.parallel_for(range<1>(N), [=](item<1> it) {
int gid = it.get_id(0);
auto atm = AtomicRef < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, space > (sum[0]);
output[gid] = atm++;
});
}).wait_and_throw();
}

// All work-items increment by 1, so final value should be equal to N.
assert(sum[0] == T(N));

// Post-increment returns original value: will be in [0, N-1].
auto min_e = std::min_element(output_begin, output_end);
auto max_e = std::max_element(output_begin, output_end);
assert(*min_e == T(0) && *max_e == T(N - 1));

// Intermediate values should be unique.
std::sort(output_begin, output_end);
assert(std::unique(output_begin, output_end) == output_end);

free(sum, q);
free(output, q);
}

template <access::address_space space, typename T, typename Difference = T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
Expand All @@ -241,6 +403,7 @@ void add_test(queue q, size_t N) {
space == access::address_space::global_space ||
(space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
constexpr bool do_ext_tests = space != access::address_space::generic_space;
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
if constexpr (do_local_tests) {
#ifdef RUN_DEPRECATED
if constexpr (do_ext_tests) {
Expand All @@ -259,23 +422,47 @@ void add_test(queue q, size_t N) {
order, scope>(q, N);
add_plus_equal_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
order, scope>(q, N);
if (do_usm_tests) {
add_fetch_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
Difference, order, scope>(q, N);
add_plus_equal_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space,
T, Difference, order, scope>(q, N);
}
if constexpr (!std::is_floating_point_v<T>) {
add_pre_inc_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
order, scope>(q, N);
add_post_inc_test<::sycl::ext::oneapi::atomic_ref, space, T, Difference,
order, scope>(q, N);
if (do_usm_tests) {
add_pre_inc_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
Difference, order, scope>(q, N);
add_post_inc_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space,
T, Difference, order, scope>(q, N);
}
}
}
#else
add_fetch_test<::sycl::atomic_ref, space, T, Difference, order, scope>(q,
N);
add_plus_equal_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
q, N);
if (do_usm_tests) {
add_fetch_test_usm_shared<::sycl::atomic_ref, space, T, Difference, order,
scope>(q, N);
add_plus_equal_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
order, scope>(q, N);
}
if constexpr (!std::is_floating_point_v<T>) {
add_pre_inc_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
q, N);
add_post_inc_test<::sycl::atomic_ref, space, T, Difference, order, scope>(
q, N);
if (do_usm_tests) {
add_pre_inc_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
order, scope>(q, N);
add_post_inc_test_usm_shared<::sycl::atomic_ref, space, T, Difference,
order, scope>(q, N);
}
}
#endif
}
Expand Down
46 changes: 46 additions & 0 deletions sycl/test-e2e/AtomicRef/and.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,44 @@ void and_global_test(queue q) {
assert(std::unique(output.begin(), output.end()) == output.end());
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space space, typename T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
void and_global_test_usm_shared(queue q) {
const size_t N = 32;
const T initial = T((1ll << N) - 1);
T *cum = malloc_shared<T>(1, q);
cum[0] = initial;
T *output = malloc_shared<T>(N, q);
T *output_begin = &output[0], *output_end = &output[N];
std::fill(output_begin, output_end, T(0));
{
q.submit([&](handler &cgh) {
cgh.parallel_for(range<1>(N), [=](item<1> it) {
size_t gid = it.get_id(0);
auto atm = AtomicRef < T,
(order == memory_order::acquire || order == memory_order::release)
? memory_order::relaxed
: order,
scope, space > (cum[0]);
output[gid] = atm.fetch_and(~T(1ll << gid), order);
});
}).wait_and_throw();
}

// Final value should be equal to 0.
assert(cum[0] == 0);

// All other values should be unique; each work-item sets one bit to 0.
std::sort(output_begin, output_end);
assert(std::unique(output_begin, output_end) == output_end);

free(cum, q);
free(output, q);
}

template <access::address_space space, typename T,
memory_order order = memory_order::relaxed,
memory_scope scope = memory_scope::device>
Expand All @@ -108,6 +146,7 @@ void and_test(queue q) {
space == access::address_space::global_space ||
(space == access::address_space::generic_space && !TEST_GENERIC_IN_LOCAL);
constexpr bool do_ext_tests = space != access::address_space::generic_space;
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
if constexpr (do_local_tests) {
#ifdef RUN_DEPRECATED
if constexpr (do_ext_tests) {
Expand All @@ -123,9 +162,16 @@ void and_test(queue q) {
if constexpr (do_ext_tests) {
and_global_test<::sycl::ext::oneapi::atomic_ref, space, T, order, scope>(
q);
if (do_usm_tests) {
and_global_test_usm_shared<::sycl::ext::oneapi::atomic_ref, space, T,
order, scope>(q);
}
}
#else
and_global_test<::sycl::atomic_ref, space, T, order, scope>(q);
if (do_usm_tests) {
and_global_test_usm_shared<::sycl::atomic_ref, space, T, order, scope>(q);
}
#endif
}
}
Expand Down
46 changes: 46 additions & 0 deletions sycl/test-e2e/AtomicRef/assignment.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,63 @@ void assignment_test(queue q, size_t N) {
assert(assignment >= T(0) && assignment <= T(N - 1));
}

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space address_space, typename T>
class assignment_usm_kernel;

template <template <typename, memory_order, memory_scope, access::address_space>
class AtomicRef,
access::address_space address_space, typename T>
void assignment_test_usm_shared(queue q, size_t N) {
T initial = T(N);
T *st = malloc_shared<T>(1, q);
st[0] = initial;
{
q.submit([&](handler &cgh) {
cgh.parallel_for<assignment_usm_kernel<AtomicRef, address_space, T>>(
range<1>(N), [=](item<1> it) {
size_t gid = it.get_id(0);
auto atm = AtomicRef<T, memory_order::relaxed,
memory_scope::device, address_space>(st[0]);
atm = T(gid);
});
}).wait_and_throw();
}

// The initial value should have been overwritten by a work-item ID.
// Atomicity isn't tested here, but support for assignment() is.
assert(st[0] != initial);
assert(st[0] >= T(0) && st[0] <= T(N - 1));

free(st, q);
}

template <typename T> void assignment_test(queue q, size_t N) {
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
#ifdef RUN_DEPRECATED
assignment_test<::sycl::ext::oneapi::atomic_ref,
access::address_space::global_space, T>(q, N);
if (do_usm_tests) {
assignment_test_usm_shared<::sycl::ext::oneapi::atomic_ref,
access::address_space::global_space, T>(q, N);
}
#else
assignment_test<::sycl::atomic_ref, access::address_space::global_space, T>(
q, N);
if (do_usm_tests) {
assignment_test_usm_shared<::sycl::atomic_ref,
access::address_space::global_space, T>(q, N);
}
#endif
}

template <typename T> void assignment_generic_test(queue q, size_t N) {
bool do_usm_tests = q.get_device().has(aspect::usm_shared_allocations);
assignment_test<::sycl::atomic_ref, access::address_space::generic_space, T>(
q, N);
if (do_usm_tests) {
assignment_test_usm_shared<::sycl::atomic_ref,
access::address_space::generic_space, T>(q, N);
}
}
Loading

0 comments on commit 42c0092

Please sign in to comment.