Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Rewrite tests that fail when usm_shared_allocations not supported #2 #12655

Merged
merged 23 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
// the tests, please check if they pass without the discard_events property, if
// they don't pass then it's most likely a general issue unrelated to
// discard_events.

// REQUIRES: aspect-usm_shared_allocations
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I right that we can't rewrite discard event test cases to use buffer & accessor instead because the latter introduces dependencies in the execution graph that this test case doesn't want?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That could be a reason but for me what convinced me to not do it is that many functions in this test have the USM string in their names. Therefore, with limited knowledge of how this test is supposed to work, I decided to leave it as is.

#include <CL/sycl.hpp>
#include <cassert>
#include <iostream>
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %{build} -o %t.out

// RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt

// REQUIRES: aspect-usm_shared_allocations
// The test checks that the last parameter is `nullptr` for all PI calls that
// should discard events.
// {{0|0000000000000000}} is required for various output on Linux and Windows.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %{build} -o %t.out

// RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt

// REQUIRES: aspect-usm_shared_allocations
// The test checks that the last parameter is not `nullptr` for all PI calls
// that should discard events.
// {{0|0000000000000000}} is required for various output on Linux and Windows.
Expand Down
5 changes: 2 additions & 3 deletions sycl/test-e2e/DiscardEvents/invalid_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// https://github.com/intel/llvm/issues/7330.
// UNSUPPORTED: opencl && gpu
// RUN: %{build} -o %t.out

// RUN: %{run} %t.out

// The test checks that each PI call to the queue returns a discarded event
Expand All @@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) {
sycl::range<1> range(BUFFER_SIZE);

auto Dev = Q.get_device();
int *x = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
int *x = sycl::malloc_device<int>(BUFFER_SIZE, Q);
assert(x != nullptr);
int *y = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
int *y = sycl::malloc_device<int>(BUFFER_SIZE, Q);
assert(y != nullptr);

sycl::event DiscardedEvent;
Expand Down
55 changes: 30 additions & 25 deletions sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
#include <iostream>
#include <sycl/sycl.hpp>

template <typename T>
sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
T *out_data, int num_wg, int group_size) {
template <typename T, typename AccessorT>
sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data,
AccessorT &out_data, int num_wg,
int group_size) {
sycl::nd_range<1> thread_range(num_wg * group_size, group_size);
sycl::event event = queue->submit([&](sycl::handler &cgh) {
cgh.require(in_data);
cgh.require(out_data);
cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) {
auto id = item.get_global_linear_id();
auto group = item.get_group();
Expand All @@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
return event;
}

template <typename T>
void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data,
int num_wg, int group_size) {
compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size);
template <typename T, typename AccessorT>
void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data,
AccessorT &out_data, int num_wg, int group_size) {
compiler_group_scan_impl<T>(queue, in_data, out_data, num_wg, group_size);
}

int main(int argc, const char **argv) {
int num_wg = 1;
int group_size = 16;
constexpr int num_wg = 1;
constexpr int group_size = 16;

sycl::queue queue;

typedef int T;
size_t nelems = num_wg * group_size;
T *data = sycl::malloc_shared<T>(nelems, queue);
T *result = sycl::malloc_shared<T>(nelems, queue);
queue.fill<T>(data, T(2), nelems).wait();
queue.memset(result, 0, nelems * sizeof(T)).wait();

test_compiler_group_scan(&queue, data, result, num_wg, group_size);
queue.wait();
T expected[] = {1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) {
assert(result[i] == expected[i]);
constexpr size_t nelems = num_wg * group_size;
int data[nelems];
int result[nelems];
for (size_t i = 0; i < nelems; ++i) {
data[i] = 2;
result[i] = 0;
}
sycl::buffer<int> data_buf{&data[0], sycl::range{nelems}};
sycl::buffer<int> result_buf{&result[0], sycl::range{nelems}};
sycl::accessor data_acc{data_buf};
sycl::accessor result_acc{result_buf};
test_compiler_group_scan<int>(&queue, data_acc, result_acc, num_wg,
group_size);
sycl::host_accessor result_host{result_buf};
int expected[] = {1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) {
assert(result_host[i] == expected[i]);
}
sycl::free(data, queue);
sycl::free(result, queue);
return 0;
}
97 changes: 50 additions & 47 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,29 @@ void testRootGroup() {
max_num_work_group_sync>(q);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};

int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);
q.submit([&](sycl::handler &h) {
sycl::accessor data{dataBuf, h};
h.parallel_for<class RootGroupKernel>(
range, props, [=](sycl::nd_item<1> it) {
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);

root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
int sum = data[root.get_local_id()] +
data[root.get_local_range() - root.get_local_id() - 1];
sycl::group_barrier(root);
data[root.get_local_id()] = sum;
root =
sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
int sum = data[root.get_local_id()] +
data[root.get_local_range() - root.get_local_id() - 1];
sycl::group_barrier(root);
data[root.get_local_id()] = sum;
});
});
q.wait();

sycl::host_accessor data{dataBuf};
const int workItemCount = static_cast<int>(range.get_global_range().size());
for (int i = 0; i < workItemCount; i++) {
assert(data[i] == (workItemCount - 1));
}
sycl::free(data, q);
}

void testRootGroupFunctions() {
Expand All @@ -76,44 +78,45 @@ void testRootGroupFunctions() {
sycl::ext::oneapi::experimental::use_root_sync};

constexpr int testCount = 10;
bool *testResults = sycl::malloc_shared<bool>(testCount, q);
sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupFunctionsKernel>(
range, props, [=](sycl::nd_item<1> it) {
const auto root = it.ext_oneapi_get_root_group();
if (root.leader() || root.get_local_id() == 3) {
testResults[0] = root.get_group_id() == sycl::id<1>(0);
testResults[1] = root.leader()
? root.get_local_id() == sycl::id<1>(0)
: root.get_local_id() == sycl::id<1>(3);
testResults[2] = root.get_group_range() == sycl::range<1>(1);
testResults[3] =
root.get_local_range() == sycl::range<1>(WorkGroupSize);
testResults[4] =
root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
testResults[5] = root.get_group_linear_id() == 0;
testResults[6] =
root.get_local_linear_id() == root.get_local_id().get(0);
testResults[7] = root.get_group_linear_range() == 1;
testResults[8] = root.get_local_linear_range() == WorkGroupSize;

const auto child =
sycl::ext::oneapi::experimental::get_child_group(root);
const auto grandchild =
sycl::ext::oneapi::experimental::get_child_group(child);
testResults[9] = child == it.get_group();
static_assert(
std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
sycl::sub_group>,
"get_child_group(sycl::group) must return a sycl::sub_group");
}
});
q.wait();
q.submit([&](sycl::handler &h) {
sycl::accessor testResults{testResultsBuf, h};
h.parallel_for<class RootGroupFunctionsKernel>(
range, props, [=](sycl::nd_item<1> it) {
const auto root = it.ext_oneapi_get_root_group();
if (root.leader() || root.get_local_id() == 3) {
testResults[0] = root.get_group_id() == sycl::id<1>(0);
testResults[1] = root.leader()
? root.get_local_id() == sycl::id<1>(0)
: root.get_local_id() == sycl::id<1>(3);
testResults[2] = root.get_group_range() == sycl::range<1>(1);
testResults[3] =
root.get_local_range() == sycl::range<1>(WorkGroupSize);
testResults[4] =
root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
testResults[5] = root.get_group_linear_id() == 0;
testResults[6] =
root.get_local_linear_id() == root.get_local_id().get(0);
testResults[7] = root.get_group_linear_range() == 1;
testResults[8] = root.get_local_linear_range() == WorkGroupSize;

const auto child =
sycl::ext::oneapi::experimental::get_child_group(root);
const auto grandchild =
sycl::ext::oneapi::experimental::get_child_group(child);
testResults[9] = child == it.get_group();
static_assert(
std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
sycl::sub_group>,
"get_child_group(sycl::group) must return a sycl::sub_group");
}
});
});
sycl::host_accessor testResults{testResultsBuf};
for (int i = 0; i < testCount; i++) {
assert(testResults[i]);
}
sycl::free(testResults, q);
}

int main() {
Expand Down
3 changes: 2 additions & 1 deletion sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ int main() {
Failed += Check(Q, "host_task", [&]() {
return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); });
});

if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
return Failed;
constexpr size_t N = 64;
int *Data1 = sycl::malloc_shared<int>(N, Q);
int *Data2 = sycl::malloc_shared<int>(N, Q);
Expand Down
23 changes: 13 additions & 10 deletions sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,26 @@ int main() {
sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};

int *DevData = sycl::malloc_shared<int>(N, Dev, Ctx);
sycl::buffer<int> DevDataBuf{sycl::range{N}};
sycl::accessor DevData{DevDataBuf};
int *HostData = (int *)malloc(N * sizeof(int) * 10);

for (size_t I = 0; I < 10; ++I) {
Q1.fill<int>(DevData, 0, N);
sycl::event E1 = Q1.parallel_for(
N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
Q1.fill(DevData, 0);
sycl::event E1 = Q1.submit([&](sycl::handler &h) {
h.require(DevData);
h.parallel_for(
N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
});

Q2.ext_oneapi_set_external_event(E1);
sycl::event E2 =
Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
sycl::event E2 = Q2.submit([&](sycl::handler &h) {
h.require(DevData);
h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
});

Q1.ext_oneapi_set_external_event(E2);
Q1.copy(DevData, HostData + N * I, N);
Q1.copy(DevData, HostData + N * I);
}

Q1.wait_and_throw();
Expand All @@ -46,9 +52,6 @@ int main() {
}
}
}

sycl::free(DevData, Ctx);
free(HostData);

return Failures;
}
2 changes: 1 addition & 1 deletion sycl/test-e2e/KernelAndProgram/disable-caching.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ int main() {
// CHECK-CACHE: piKernelRelease
// CHECK-CACHE: piProgramRelease
// CHECK-CACHE: piEventsWait
auto *p = malloc_shared<int>(1, q);
auto *p = malloc_device<int>(1, q);
for (int i = 0; i < 2; ++i)
q.submit([&](handler &cgh) {
cgh.set_specialization_constant<spec_id>(i);
Expand Down
34 changes: 19 additions & 15 deletions sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@ int main() {
queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};

int *in1 = sycl::malloc_shared<int>(dataSize, q1);
int *in2 = sycl::malloc_shared<int>(dataSize, q1);
int *in3 = sycl::malloc_shared<int>(dataSize, q1);
int *tmp = sycl::malloc_shared<int>(dataSize, q1);
int *out = sycl::malloc_shared<int>(dataSize, q1);

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
int *in1 = sycl::malloc_device<int>(dataSize, q1);
int *in2 = sycl::malloc_device<int>(dataSize, q1);
int *in3 = sycl::malloc_device<int>(dataSize, q1);
int *tmp = sycl::malloc_device<int>(dataSize, q1);
int *out = sycl::malloc_device<int>(dataSize, q1);

q1.single_task<class InitKernel>([=]() {
for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
}).wait();

ext::codeplay::experimental::fusion_wrapper fw1{q1};
fw1.start_fusion();
Expand Down Expand Up @@ -71,10 +73,12 @@ int main() {

q1.wait();
q2.wait();

int host_out[dataSize];
q1.memcpy(host_out, out, dataSize * sizeof(int));
q1.wait();
// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (40 * i * i) && "Computation error");
assert(host_out[i] == (40 * i * i) && "Computation error");
}
sycl::free(in1, q1);
sycl::free(in2, q1);
Expand Down
Loading
Loading