Skip to content

Commit

Permalink
[SYCL][COMPAT] Add experimental launch overloads accepting SG size (#…
Browse files Browse the repository at this point in the history
…13767)

add experimental launch overloads accepting SG size in SYCL
  • Loading branch information
AD2605 committed May 15, 2024
1 parent c173fbf commit ebdae02
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 0 deletions.
33 changes: 33 additions & 0 deletions sycl/doc/syclcompat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,39 @@ This `launch` interface allows users to define an internal memory pool, or
scratchpad, that can then be reinterpreted as the datatype required by the user
within the kernel function.
To launch a kernel with a specified sub-group size, overloads similar to above `launch`
functions are present in the `syclcompat::experimental` namespace, which accept SubgroupSize
as a template parameter and can be called as `launch<Function, SubgroupSize>`
```cpp
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size,
sycl::queue queue, Args... args);
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(sycl::nd_range<Dim> launch_range, std::size_t local_memory_size,
Args... args);
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim,
std::size_t local_memory_size, Args... args);
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(sycl::nd_range<3> launch_range, sycl::queue queue,
Args... args);
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(sycl::nd_range<Dim> launch_range,
Args... args);
template <auto F, int SubgroupSize, typename... Args>
sycl::event launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim,
Args... args);
```

### Utilities

SYCLcompat introduces a set of utility functions designed to streamline the
Expand Down
105 changes: 105 additions & 0 deletions sycl/include/syclcompat/launch_experimental.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/***************************************************************************
*
* Copyright (C) Codeplay Software Ltd.
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM
* Exceptions. See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SYCLcompat
*
* launch_experimental.hpp
*
* Description:
* Launch Overloads with accepting required subgroup size
**************************************************************************/

#pragma once

#include <syclcompat/device.hpp>
#include <syclcompat/dims.hpp>
#include <syclcompat/launch.hpp>

namespace syclcompat {
namespace experimental {

//================================================================================================//
// Overloads using Local Memory //
//================================================================================================//

template <auto F, int SubgroupSize, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args..., char *>, sycl::event>
launch(sycl::nd_range<3> launch_range, std::size_t local_memory_size,
sycl::queue queue, Args... args) {
return queue.submit([&](sycl::handler &cgh) {
sycl::local_accessor<char, 1> loc(local_memory_size, cgh);
cgh.parallel_for(
launch_range,
[=](sycl::nd_item<3> it) [[sycl::reqd_sub_group_size(SubgroupSize)]] {
[[clang::always_inline]] F(
args..., loc.get_multi_ptr<sycl::access::decorated::yes>());
});
});
}

template <auto F, int SubgroupSize, int Dim, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args..., char *>, sycl::event>
launch(sycl::nd_range<Dim> launch_range, std::size_t local_memory_size,
Args... args) {
return launch<F, SubgroupSize, Args...>(
::syclcompat::detail::transform_nd_range(launch_range), local_memory_size,
::syclcompat::get_default_queue(), args...);
}

template <auto F, int SubgroupSize, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args..., char *>, sycl::event>
launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim,
std::size_t local_memory_size, Args... args) {
return launch<F, SubgroupSize, Args...>(
::syclcompat::detail::transform_nd_range(sycl::nd_range(
sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))),
local_memory_size, ::syclcompat::get_default_queue(), args...);
}

//================================================================================================//
// Overloads not using Local Memory //
//================================================================================================//

template <auto F, int SubgroupSize, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
launch(sycl::nd_range<3> launch_range, sycl::queue queue, Args... args) {
return queue.submit([&](sycl::handler &cgh) {
cgh.parallel_for(launch_range,
[=](sycl::nd_item<3> it)
[[sycl::reqd_sub_group_size(SubgroupSize)]] {
[[clang::always_inline]] F(args...);
});
});
}

template <auto F, int SubgroupSize, int Dim, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
launch(sycl::nd_range<Dim> launch_range, Args... args) {
return launch<F, SubgroupSize, Args...>(
::syclcompat::detail::transform_nd_range(launch_range),
::syclcompat::get_default_queue(), args...);
}

template <auto F, int SubgroupSize, typename... Args>
std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
launch(::syclcompat::dim3 grid_dim, ::syclcompat::dim3 block_dim,
Args... args) {
return launch<F, SubgroupSize, Args...>(
::syclcompat::detail::transform_nd_range(sycl::nd_range(
sycl::range<3>(grid_dim * block_dim), sycl::range<3>(block_dim))),
::syclcompat::get_default_queue(), args...);
}

} // namespace experimental
} // namespace syclcompat
1 change: 1 addition & 0 deletions sycl/include/syclcompat/syclcompat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <syclcompat/id_query.hpp>
#include <syclcompat/kernel.hpp>
#include <syclcompat/launch.hpp>
#include <syclcompat/launch_experimental.hpp>
#include <syclcompat/math.hpp>
#include <syclcompat/memory.hpp>
#include <syclcompat/util.hpp>
199 changes: 199 additions & 0 deletions sycl/test-e2e/syclcompat/launch/launch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
#include <sycl/sycl.hpp>

#include <syclcompat/device.hpp>
#include <syclcompat/id_query.hpp>
#include <syclcompat/launch.hpp>
#include <syclcompat/launch_experimental.hpp>
#include <syclcompat/memory.hpp>

#include "../common.hpp"
Expand Down Expand Up @@ -60,6 +62,51 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
}
};

template <typename T>
void reqd_sg_size_kernel(int modifier_val, int num_elements, T *data) {

const int id = sycl::ext::oneapi::this_work_item::get_nd_item<3>()
.get_global_linear_id();
const int sg_size = sycl::ext::oneapi::this_work_item::get_nd_item<3>()
.get_sub_group()
.get_local_linear_range();
if (id < num_elements) {
if (id < num_elements - modifier_val) {
data[id] = static_cast<T>(
(id + modifier_val - sg_size) < 0 ? 0 : id + modifier_val - sg_size);
} else {
data[id] = static_cast<T>(id + modifier_val + sg_size);
}
}
}

template <typename T>
void reqd_sg_size_kernel_with_local_memory(int modifier_val, int num_elements,
T *data, char *local_mem) {
T *typed_local_mem = reinterpret_cast<T *>(local_mem);
const int id = sycl::ext::oneapi::this_work_item::get_nd_item<3>()
.get_global_linear_id();
const int sg_size = sycl::ext::oneapi::this_work_item::get_nd_item<3>()
.get_sub_group()
.get_local_linear_range();

const int wi_id_in_wg =
sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_local_linear_id();

if (id < num_elements - modifier_val) {
typed_local_mem[wi_id_in_wg] = static_cast<T>(
(id + modifier_val - sg_size) < 0 ? 0 : id + modifier_val - sg_size);
} else {
typed_local_mem[wi_id_in_wg] = static_cast<T>(id + modifier_val + sg_size);
}

syclcompat::wg_barrier();

if (id < num_elements) {
data[id] = typed_local_mem[wi_id_in_wg];
}
}

template <int Dim>
void compute_nd_range_3d(RangeParams<Dim> range_param, std::string test_name) {
std::cout << __PRETTY_FUNCTION__ << " " << test_name << std::endl;
Expand Down Expand Up @@ -326,6 +373,154 @@ template <typename T> void test_memsize_no_arg_launch_q() {
memsize, lt.q_);
}

template <typename T> void test_reqd_sg_size() {
namespace syclc_exp = syclcompat::experimental;

std::cout << __PRETTY_FUNCTION__ << std::endl;

LaunchTestWithArgs<T> ltt;
if (ltt.skip_) // Unsupported aspect
return;

int SubgroupSize = 16;
const int modifier_val = 9;
const int num_elements = 1024;

T *h_a = (T *)syclcompat::malloc_host(num_elements * sizeof(T));
T *d_a = (T *)syclcompat::malloc(num_elements * sizeof(T));
auto sg_sizes = syclcompat::get_default_queue()
.get_device()
.get_info<sycl::info::device::sub_group_sizes>();

if (std::find(sg_sizes.begin(), sg_sizes.end(), 16) != sg_sizes.end()) {
syclc_exp::launch<reqd_sg_size_kernel<T>, 16>(
ltt.grid_, ltt.thread_, modifier_val, static_cast<int>(num_elements),
d_a);
} else {
SubgroupSize = 32;
syclc_exp::launch<reqd_sg_size_kernel<T>, 32>(
ltt.grid_, ltt.thread_, modifier_val, static_cast<int>(num_elements),
d_a);
}

syclcompat::wait_and_throw();
syclcompat::memcpy<T>(h_a, d_a, num_elements);
syclcompat::free(d_a);

for (int i = 0; i < static_cast<int>(num_elements); i++) {
T result;
if (i < (static_cast<int>(num_elements) - modifier_val)) {
result = static_cast<T>((i + modifier_val - SubgroupSize) < 0
? 0
: (i + modifier_val - SubgroupSize));
} else {
result = static_cast<T>(i + modifier_val + SubgroupSize);
}
assert(h_a[i] == result);
}

syclcompat::free(h_a);
}

template <typename T> void test_reqd_sg_size_q() {
namespace syclc_exp = syclcompat::experimental;
std::cout << __PRETTY_FUNCTION__ << std::endl;

LaunchTestWithArgs<T> ltt;
if (ltt.skip_) // Unsupported aspect
return;
int SubgroupSize = 16;
const int modifier_val = 9;
auto &q = ltt.in_order_q_;
const int num_elements = 1024;

T *h_a = (T *)syclcompat::malloc_host(num_elements * sizeof(T), q);
T *d_a = (T *)syclcompat::malloc(num_elements * sizeof(T), q);
sycl::nd_range<3> launch_range(sycl::range<3>(ltt.grid_ * ltt.thread_),
sycl::range<3>(ltt.thread_));
auto sg_sizes =
q.get_device().template get_info<sycl::info::device::sub_group_sizes>();
if (std::find(sg_sizes.begin(), sg_sizes.end(), 16) != sg_sizes.end()) {
syclc_exp::launch<reqd_sg_size_kernel<T>, 16>(
launch_range, q, modifier_val, static_cast<int>(num_elements), d_a);
} else {
SubgroupSize = 32;
syclc_exp::launch<reqd_sg_size_kernel<T>, 32>(
launch_range, q, modifier_val, static_cast<int>(num_elements), d_a);
}

syclcompat::wait_and_throw();
syclcompat::memcpy<T>(h_a, d_a, num_elements, q);
syclcompat::free(d_a, q);

for (int i = 0; i < static_cast<int>(num_elements); i++) {
T result;
if (i < (static_cast<int>(num_elements) - modifier_val)) {
result = static_cast<T>((i + modifier_val - SubgroupSize) < 0
? 0
: (i + modifier_val - SubgroupSize));
} else {
result = static_cast<T>(i + modifier_val + SubgroupSize);
}
assert(h_a[i] == result);
}
syclcompat::free(h_a, q);
}

template <typename T> void test_reqd_sg_size_with_local_memory() {
namespace syclc_exp = syclcompat::experimental;

std::cout << __PRETTY_FUNCTION__ << std::endl;

LaunchTestWithArgs<T> ltt;
if (ltt.skip_) // Unsupported aspect
return;

int SubgroupSize = 16;
const int modifier_val = 9;

std::size_t local_memory_size =
ltt.thread_.x * ltt.thread_.y * ltt.thread_.z * sizeof(T);
auto global_range = ltt.thread_ * ltt.grid_;

auto num_elements = global_range.x * global_range.y * global_range.z;

T *h_a = (T *)syclcompat::malloc_host(num_elements * sizeof(T));
T *d_a = (T *)syclcompat::malloc(num_elements * sizeof(T));

auto sg_sizes = syclcompat::get_default_queue()
.get_device()
.get_info<sycl::info::device::sub_group_sizes>();

if (std::find(sg_sizes.begin(), sg_sizes.end(), 16) != sg_sizes.end()) {
syclc_exp::launch<reqd_sg_size_kernel_with_local_memory<T>, 16>(
ltt.grid_, ltt.thread_, local_memory_size, modifier_val,
static_cast<int>(num_elements), d_a);
} else {
SubgroupSize = 32;
syclc_exp::launch<reqd_sg_size_kernel_with_local_memory<T>, 32>(
ltt.grid_, ltt.thread_, local_memory_size, modifier_val,
static_cast<int>(num_elements), d_a);
}

syclcompat::wait_and_throw();
syclcompat::memcpy<T>(h_a, d_a, num_elements);

for (int i = 0; i < static_cast<int>(num_elements); i++) {
T result;
if (i < (static_cast<int>(num_elements) - modifier_val)) {
result = static_cast<T>((i + modifier_val - SubgroupSize) < 0
? 0
: (i + modifier_val - SubgroupSize));
} else {
result = static_cast<T>(i + modifier_val + SubgroupSize);
}
assert(h_a[i] == result);
}
syclcompat::free(d_a);
syclcompat::free(h_a);
}

int main() {
test_launch_compute_nd_range_3d();
test_no_arg_launch();
Expand All @@ -345,5 +540,9 @@ int main() {
INSTANTIATE_ALL_TYPES(memsize_type_list, test_memsize_no_arg_launch);
INSTANTIATE_ALL_TYPES(memsize_type_list, test_memsize_no_arg_launch_q);

INSTANTIATE_ALL_TYPES(memsize_type_list, test_reqd_sg_size);
INSTANTIATE_ALL_TYPES(memsize_type_list, test_reqd_sg_size_q);
INSTANTIATE_ALL_TYPES(memsize_type_list, test_reqd_sg_size_with_local_memory);

return 0;
}

0 comments on commit ebdae02

Please sign in to comment.