Skip to content

Commit

Permalink
[SYCL] Rewrite tests that fail when usm_shared_allocations not suppor…
Browse files Browse the repository at this point in the history
…ted #2 (intel#12655)

Continuation of intel#12636. 
Refer to it for a description.
  • Loading branch information
lbushi25 authored Feb 12, 2024
1 parent 3ebffba commit 6639e78
Show file tree
Hide file tree
Showing 21 changed files with 237 additions and 206 deletions.
2 changes: 1 addition & 1 deletion sycl/test-e2e/DiscardEvents/discard_events_mixed_calls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
// the tests, please check if they pass without the discard_events property, if
// they don't pass then it's most likely a general issue unrelated to
// discard_events.

// REQUIRES: aspect-usm_shared_allocations
#include <CL/sycl.hpp>
#include <cassert>
#include <iostream>
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/DiscardEvents/discard_events_usm.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %{build} -o %t.out

// RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt

// REQUIRES: aspect-usm_shared_allocations
// The test checks that the last parameter is `nullptr` for all PI calls that
// should discard events.
// {{0|0000000000000000}} is required for various output on Linux and Windows.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %{build} -o %t.out

// RUN: env SYCL_PI_TRACE=2 %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt

// REQUIRES: aspect-usm_shared_allocations
// The test checks that the last parameter is not `nullptr` for all PI calls
// that should discard events.
// {{0|0000000000000000}} is required for various output on Linux and Windows.
Expand Down
5 changes: 2 additions & 3 deletions sycl/test-e2e/DiscardEvents/invalid_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// https://github.com/intel/llvm/issues/7330.
// UNSUPPORTED: opencl && gpu
// RUN: %{build} -o %t.out

// RUN: %{run} %t.out

// The test checks that each PI call to the queue returns a discarded event
Expand All @@ -19,9 +18,9 @@ void QueueAPIsReturnDiscardedEvent(sycl::queue Q) {
sycl::range<1> range(BUFFER_SIZE);

auto Dev = Q.get_device();
int *x = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
int *x = sycl::malloc_device<int>(BUFFER_SIZE, Q);
assert(x != nullptr);
int *y = sycl::malloc_shared<int>(BUFFER_SIZE, Q);
int *y = sycl::malloc_device<int>(BUFFER_SIZE, Q);
assert(y != nullptr);

sycl::event DiscardedEvent;
Expand Down
55 changes: 30 additions & 25 deletions sycl/test-e2e/GroupAlgorithm/exclusive_scan_over_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@
#include <iostream>
#include <sycl/sycl.hpp>

template <typename T>
sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
T *out_data, int num_wg, int group_size) {
template <typename T, typename AccessorT>
sycl::event compiler_group_scan_impl(sycl::queue *queue, AccessorT &in_data,
AccessorT &out_data, int num_wg,
int group_size) {
sycl::nd_range<1> thread_range(num_wg * group_size, group_size);
sycl::event event = queue->submit([&](sycl::handler &cgh) {
cgh.require(in_data);
cgh.require(out_data);
cgh.parallel_for(thread_range, [=](sycl::nd_item<1> item) {
auto id = item.get_global_linear_id();
auto group = item.get_group();
Expand All @@ -27,33 +30,35 @@ sycl::event compiler_group_scan_impl(sycl::queue *queue, T *in_data,
return event;
}

template <typename T>
void test_compiler_group_scan(sycl::queue *queue, T *in_data, T *out_data,
int num_wg, int group_size) {
compiler_group_scan_impl(queue, in_data, out_data, num_wg, group_size);
template <typename T, typename AccessorT>
void test_compiler_group_scan(sycl::queue *queue, AccessorT &in_data,
AccessorT &out_data, int num_wg, int group_size) {
compiler_group_scan_impl<T>(queue, in_data, out_data, num_wg, group_size);
}

int main(int argc, const char **argv) {
int num_wg = 1;
int group_size = 16;
constexpr int num_wg = 1;
constexpr int group_size = 16;

sycl::queue queue;

typedef int T;
size_t nelems = num_wg * group_size;
T *data = sycl::malloc_shared<T>(nelems, queue);
T *result = sycl::malloc_shared<T>(nelems, queue);
queue.fill<T>(data, T(2), nelems).wait();
queue.memset(result, 0, nelems * sizeof(T)).wait();

test_compiler_group_scan(&queue, data, result, num_wg, group_size);
queue.wait();
T expected[] = {1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
for (int i = 0; i < sizeof(expected) / sizeof(T); ++i) {
assert(result[i] == expected[i]);
constexpr size_t nelems = num_wg * group_size;
int data[nelems];
int result[nelems];
for (size_t i = 0; i < nelems; ++i) {
data[i] = 2;
result[i] = 0;
}
sycl::buffer<int> data_buf{&data[0], sycl::range{nelems}};
sycl::buffer<int> result_buf{&result[0], sycl::range{nelems}};
sycl::accessor data_acc{data_buf};
sycl::accessor result_acc{result_buf};
test_compiler_group_scan<int>(&queue, data_acc, result_acc, num_wg,
group_size);
sycl::host_accessor result_host{result_buf};
int expected[] = {1, 2, 4, 8, 16, 32, 64, 128,
256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
for (int i = 0; i < sizeof(expected) / sizeof(int); ++i) {
assert(result_host[i] == expected[i]);
}
sycl::free(data, queue);
sycl::free(result, queue);
return 0;
}
97 changes: 50 additions & 47 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,29 @@ void testRootGroup() {
max_num_work_group_sync>(q);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};

int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
sycl::buffer<int> dataBuf{sycl::range{maxWGs * WorkGroupSize}};
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);
q.submit([&](sycl::handler &h) {
sycl::accessor data{dataBuf, h};
h.parallel_for<class RootGroupKernel>(
range, props, [=](sycl::nd_item<1> it) {
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);

root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
int sum = data[root.get_local_id()] +
data[root.get_local_range() - root.get_local_id() - 1];
sycl::group_barrier(root);
data[root.get_local_id()] = sum;
root =
sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
int sum = data[root.get_local_id()] +
data[root.get_local_range() - root.get_local_id() - 1];
sycl::group_barrier(root);
data[root.get_local_id()] = sum;
});
});
q.wait();

sycl::host_accessor data{dataBuf};
const int workItemCount = static_cast<int>(range.get_global_range().size());
for (int i = 0; i < workItemCount; i++) {
assert(data[i] == (workItemCount - 1));
}
sycl::free(data, q);
}

void testRootGroupFunctions() {
Expand All @@ -76,44 +78,45 @@ void testRootGroupFunctions() {
sycl::ext::oneapi::experimental::use_root_sync};

constexpr int testCount = 10;
bool *testResults = sycl::malloc_shared<bool>(testCount, q);
sycl::buffer<bool> testResultsBuf{sycl::range{testCount}};
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupFunctionsKernel>(
range, props, [=](sycl::nd_item<1> it) {
const auto root = it.ext_oneapi_get_root_group();
if (root.leader() || root.get_local_id() == 3) {
testResults[0] = root.get_group_id() == sycl::id<1>(0);
testResults[1] = root.leader()
? root.get_local_id() == sycl::id<1>(0)
: root.get_local_id() == sycl::id<1>(3);
testResults[2] = root.get_group_range() == sycl::range<1>(1);
testResults[3] =
root.get_local_range() == sycl::range<1>(WorkGroupSize);
testResults[4] =
root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
testResults[5] = root.get_group_linear_id() == 0;
testResults[6] =
root.get_local_linear_id() == root.get_local_id().get(0);
testResults[7] = root.get_group_linear_range() == 1;
testResults[8] = root.get_local_linear_range() == WorkGroupSize;

const auto child =
sycl::ext::oneapi::experimental::get_child_group(root);
const auto grandchild =
sycl::ext::oneapi::experimental::get_child_group(child);
testResults[9] = child == it.get_group();
static_assert(
std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
sycl::sub_group>,
"get_child_group(sycl::group) must return a sycl::sub_group");
}
});
q.wait();
q.submit([&](sycl::handler &h) {
sycl::accessor testResults{testResultsBuf, h};
h.parallel_for<class RootGroupFunctionsKernel>(
range, props, [=](sycl::nd_item<1> it) {
const auto root = it.ext_oneapi_get_root_group();
if (root.leader() || root.get_local_id() == 3) {
testResults[0] = root.get_group_id() == sycl::id<1>(0);
testResults[1] = root.leader()
? root.get_local_id() == sycl::id<1>(0)
: root.get_local_id() == sycl::id<1>(3);
testResults[2] = root.get_group_range() == sycl::range<1>(1);
testResults[3] =
root.get_local_range() == sycl::range<1>(WorkGroupSize);
testResults[4] =
root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
testResults[5] = root.get_group_linear_id() == 0;
testResults[6] =
root.get_local_linear_id() == root.get_local_id().get(0);
testResults[7] = root.get_group_linear_range() == 1;
testResults[8] = root.get_local_linear_range() == WorkGroupSize;

const auto child =
sycl::ext::oneapi::experimental::get_child_group(root);
const auto grandchild =
sycl::ext::oneapi::experimental::get_child_group(child);
testResults[9] = child == it.get_group();
static_assert(
std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
sycl::sub_group>,
"get_child_group(sycl::group) must return a sycl::sub_group");
}
});
});
sycl::host_accessor testResults{testResultsBuf};
for (int i = 0; i < testCount; i++) {
assert(testResults[i]);
}
sycl::free(testResults, q);
}

int main() {
Expand Down
3 changes: 2 additions & 1 deletion sycl/test-e2e/InOrderEventsExt/get_last_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ int main() {
Failed += Check(Q, "host_task", [&]() {
return Q.submit([&](sycl::handler &CGH) { CGH.host_task([]() {}); });
});

if (!Q.get_device().has(sycl::aspect::usm_shared_allocations))
return Failed;
constexpr size_t N = 64;
int *Data1 = sycl::malloc_shared<int>(N, Q);
int *Data2 = sycl::malloc_shared<int>(N, Q);
Expand Down
23 changes: 13 additions & 10 deletions sycl/test-e2e/InOrderEventsExt/set_external_event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,26 @@ int main() {
sycl::queue Q1{Ctx, Dev, {sycl::property::queue::in_order{}}};
sycl::queue Q2{Ctx, Dev, {sycl::property::queue::in_order{}}};

int *DevData = sycl::malloc_shared<int>(N, Dev, Ctx);
sycl::buffer<int> DevDataBuf{sycl::range{N}};
sycl::accessor DevData{DevDataBuf};
int *HostData = (int *)malloc(N * sizeof(int) * 10);

for (size_t I = 0; I < 10; ++I) {
Q1.fill<int>(DevData, 0, N);
sycl::event E1 = Q1.parallel_for(
N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
Q1.fill(DevData, 0);
sycl::event E1 = Q1.submit([&](sycl::handler &h) {
h.require(DevData);
h.parallel_for(
N, [=](sycl::item<1> Idx) { DevData[Idx] = 42 + Idx[0] + I; });
});

Q2.ext_oneapi_set_external_event(E1);
sycl::event E2 =
Q2.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
sycl::event E2 = Q2.submit([&](sycl::handler &h) {
h.require(DevData);
h.parallel_for(N, [=](sycl::item<1> Idx) { ++DevData[Idx]; });
});

Q1.ext_oneapi_set_external_event(E2);
Q1.copy(DevData, HostData + N * I, N);
Q1.copy(DevData, HostData + N * I);
}

Q1.wait_and_throw();
Expand All @@ -46,9 +52,6 @@ int main() {
}
}
}

sycl::free(DevData, Ctx);
free(HostData);

return Failures;
}
2 changes: 1 addition & 1 deletion sycl/test-e2e/KernelAndProgram/disable-caching.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ int main() {
// CHECK-CACHE: piKernelRelease
// CHECK-CACHE: piProgramRelease
// CHECK-CACHE: piEventsWait
auto *p = malloc_shared<int>(1, q);
auto *p = malloc_device<int>(1, q);
for (int i = 0; i < 2; ++i)
q.submit([&](handler &cgh) {
cgh.set_specialization_constant<spec_id>(i);
Expand Down
34 changes: 19 additions & 15 deletions sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,21 @@ int main() {
queue q1{ext::codeplay::experimental::property::queue::enable_fusion{}};
queue q2{ext::codeplay::experimental::property::queue::enable_fusion{}};

int *in1 = sycl::malloc_shared<int>(dataSize, q1);
int *in2 = sycl::malloc_shared<int>(dataSize, q1);
int *in3 = sycl::malloc_shared<int>(dataSize, q1);
int *tmp = sycl::malloc_shared<int>(dataSize, q1);
int *out = sycl::malloc_shared<int>(dataSize, q1);

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
int *in1 = sycl::malloc_device<int>(dataSize, q1);
int *in2 = sycl::malloc_device<int>(dataSize, q1);
int *in3 = sycl::malloc_device<int>(dataSize, q1);
int *tmp = sycl::malloc_device<int>(dataSize, q1);
int *out = sycl::malloc_device<int>(dataSize, q1);

q1.single_task<class InitKernel>([=]() {
for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}
}).wait();

ext::codeplay::experimental::fusion_wrapper fw1{q1};
fw1.start_fusion();
Expand Down Expand Up @@ -71,10 +73,12 @@ int main() {

q1.wait();
q2.wait();

int host_out[dataSize];
q1.memcpy(host_out, out, dataSize * sizeof(int));
q1.wait();
// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (40 * i * i) && "Computation error");
assert(host_out[i] == (40 * i * i) && "Computation error");
}
sycl::free(in1, q1);
sycl::free(in2, q1);
Expand Down
Loading

0 comments on commit 6639e78

Please sign in to comment.