Skip to content

Commit

Permalink
[SYCL] Do not internalize kernels when supporting dynamic linking (#1…
Browse files Browse the repository at this point in the history
…5307)

Do not internalize kernels when supporting dynamic linking. Kernels must
be visible so that host code can find them.

---------

Signed-off-by: Lu, John <john.lu@intel.com>
  • Loading branch information
LU-JOHN authored Sep 18, 2024
1 parent 464b077 commit 9e39902
Show file tree
Hide file tree
Showing 3 changed files with 293 additions and 6 deletions.
8 changes: 5 additions & 3 deletions llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,13 @@ void ModuleDesc::restoreLinkageOfDirectInvokeSimdTargets() {
// the transformation safe.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV)) {
// When dynamic linking is supported, we internalize everything that can
// not be imported which also means that there is no point of having it
// When dynamic linking is supported, we internalize everything (except
// kernels which are the entry points from host code to device code) that
// cannot be imported which also means that there is no point of having it
// visible outside of the current module.
if (AllowDeviceImageDependencies)
return canBeImportedFunction(*F);
return F->getCallingConv() == CallingConv::SPIR_KERNEL ||
canBeImportedFunction(*F);

// Otherwise, we are being even more aggressive: SYCL modules are expected
// to be self-contained, meaning that they have no external dependencies.
Expand Down
9 changes: 6 additions & 3 deletions sycl/doc/design/SharedLibraries.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ if the Function "can be imported". A `canBeImportedFunction` is:

1. Not an intrinsic
2. Name does not start with "__"
3. Demangled name does not start with "__"
4. Must be a `SYCL_EXTERNAL` function
3. Is not a SPIRV, SYCL, or ESIMD builtin function
4. Demangled name does not start with "__"
5. Must be a `SYCL_EXTERNAL` function

More information about `SYCL_EXTERNAL` can be found in:
https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:syclexternal
Expand All @@ -173,7 +174,9 @@ the following modifications:
function. Instead the dependency is recorded in the imported symbols property list.
- An image that provides a `canBeImportedFunction` has the symbol recorded in the exported
symbols property list.
- All functions symbols that are not `canBeImportedFunction` are internalized
- All functions symbols that are not `canBeImportedFunction` and are not kernels are internalized.
Note that kernel functions should not be included in `canBeImportedFunction` since kernels
are only callable by host code, and thus would never need to be imported into a device image.



Expand Down
282 changes: 282 additions & 0 deletions sycl/test-e2e/DeviceDependencies/free_function_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
// Ensure -fsycl-allow-device-dependencies can work with free function kernels.

// REQUIRES: aspect-usm_shared_allocations
// RUN: %{build} -o %t.out -fsycl-allow-device-dependencies
// RUN: %{run} %t.out

// The name mangling for free function kernels currently does not work with PTX.
// UNSUPPORTED: cuda

#include <iostream>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/free_function_queries.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

void printUSM(int *usmPtr, int size) {
std::cout << "usmPtr[] = {";
for (int i = 0; i < size; i++) {
std::cout << usmPtr[i] << ", ";
}
std::cout << "}\n";
}

bool checkUSM(int *usmPtr, int size, int *Result) {
bool Pass = true;
for (int i = 0; i < size; i++) {
if (usmPtr[i] != Result[i]) {
Pass = false;
break;
}
}
if (Pass)
return true;

std::cout << "Expected = {";
for (int i = 0; i < size; i++) {
std::cout << Result[i] << ", ";
}
std::cout << "}\n";
std::cout << "Result = {";
for (int i = 0; i < size; i++) {
std::cout << usmPtr[i] << ", ";
}
std::cout << "}\n";
return false;
}

extern "C" SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::single_task_kernel)) void ff_0(int *ptr,
int start,
int end) {
for (int i = start; i <= end; i++)
ptr[i] = start + end;
}

bool test_0(queue Queue) {
constexpr int Range = 10;
int *usmPtr = malloc_shared<int>(Range, Queue);
int start = 3;
int end = 5;
int Result[Range] = {0, 0, 0, 8, 8, 8, 0, 0, 0, 0};
range<1> R1{Range};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.single_task([=]() {
for (int i = start; i <= end; i++)
usmPtr[i] = start + end;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 0a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<ff_0>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, start);
Handler.set_arg(2, end);
Handler.single_task(Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 0b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Overloaded free function definition.
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<1>))
void ff_1(int *ptr, int start, int end) {
nd_item<1> Item = ext::oneapi::this_work_item::get_nd_item<1>();
id<1> GId = Item.get_global_id();
ptr[GId.get(0)] = GId.get(0) + start + end;
}

bool test_1(queue Queue) {
constexpr int Range = 10;
int *usmPtr = malloc_shared<int>(Range, Queue);
int start = 3;
int Result[Range] = {13, 14, 15, 16, 17, 18, 19, 20, 21, 22};
nd_range<1> R1{{Range}, {1}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R1, [=](nd_item<1> Item) {
id<1> GId = Item.get_global_id();
usmPtr[GId.get(0)] = GId.get(0) + start + Range;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 1a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<(
void (*)(int *, int, int))ff_1>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, start);
Handler.set_arg(2, Range);
Handler.parallel_for(R1, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 1b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Overloaded free function definition.
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<2>))
void ff_1(int *ptr, int start) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(ptr);
nd_item<2> Item = ext::oneapi::this_work_item::get_nd_item<2>();
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + start;
}

bool test_2(queue Queue) {
constexpr int Range = 16;
int *usmPtr = malloc_shared<int>(Range, Queue);
int value = 55;
int Result[Range] = {55, 56, 55, 56, 56, 57, 56, 57,
55, 56, 55, 56, 56, 57, 56, 57};
nd_range<2> R2{range<2>{4, 4}, range<2>{2, 2}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R2, [=](nd_item<2> Item) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(usmPtr);
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + value;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 2a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id =
ext::oneapi::experimental::get_kernel_id<(void (*)(int *, int))ff_1>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, value);
Handler.parallel_for(R2, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 2b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Templated free function definition.
template <typename T>
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<2>))
void ff_3(T *ptr, T start) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(ptr);
nd_item<2> Item = ext::oneapi::this_work_item::get_nd_item<2>();
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + start;
}

// Explicit instantiation with "int*".
template void ff_3(int *ptr, int start);

bool test_3(queue Queue) {
constexpr int Range = 16;
int *usmPtr = malloc_shared<int>(Range, Queue);
int value = 55;
int Result[Range] = {55, 56, 55, 56, 56, 57, 56, 57,
55, 56, 55, 56, 56, 57, 56, 57};
nd_range<2> R2{range<2>{4, 4}, range<2>{2, 2}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R2, [=](nd_item<2> Item) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(usmPtr);
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + value;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 3a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<(
void (*)(int *, int))ff_3<int>>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, value);
Handler.parallel_for(R2, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 3b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

int main() {
queue Queue;

bool Pass = true;
Pass &= test_0(Queue);
Pass &= test_1(Queue);
Pass &= test_2(Queue);
Pass &= test_3(Queue);

return Pass ? 0 : 1;
}

0 comments on commit 9e39902

Please sign in to comment.