diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc index bb4e0f26aeaac..e509f687ecd1b 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc @@ -116,8 +116,8 @@ supports. === Defining a free function kernel A free function kernel is a normal C++ function definition, where the function -declaration is decorated with one of the following compile-time properties: -`range_kernel`, `nd_range_kernel`, or `single_task_kernel`. +declaration is decorated with either the `nd_range_kernel` compile-time +property or the `single_task_kernel` compile-time property. When a function declaration is decorated with one of these properties, the following rules must be observed: @@ -159,50 +159,6 @@ The following table provides additional details about these compile-time properties. |==== -a| -*`range_kernel` property* - -[frame=all,grid=none] -!==== -a! -[source] ----- -namespace sycl::ext::oneapi::experimental { - -struct range_kernel_key { - template - using value_t = property_value; -}; - -template -inline constexpr range_kernel_key::value_t range_kernel; - -template<> -struct is_property_key : std::true_type {}; - -} // namespace sycl::ext::oneapi::experimental ----- -!==== - -Indicates that the function is a free function kernel that is invoked with a -simple `range` iteration space of `Dims` dimensions. - -The `property_value` struct has the following member variables: - -[%header,cols="1,1"] -!==== -!Member -!Description - -a! -[source] ----- -static constexpr int dimensions = Dims ----- -! -The number of dimensions of the kernel's range. -!==== - a| *`nd_range_kernel` property* @@ -281,7 +237,7 @@ The following example demonstrates how a free function kernel using a 3-dimensional nd-range iteration space can be defined: ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::nd_range_kernel<3>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<3>)) void iota(float start, float *ptr) { // ... } @@ -305,31 +261,6 @@ a! ---- namespace sycl::ext::oneapi::experimental { -template -struct is_range_kernel; - -template -inline constexpr bool is_range_kernel_v = is_range_kernel::value; - -} // namespace sycl::ext::oneapi::experimental ----- -!==== - -If `Func` is the address of a function whose declaration is decorated with the -`range_kernel` property, the `is_range_kernel` trait provides -the member constant `value` equal to `true`. -Otherwise `value` is `false`. - -The helper trait `is_range_kernel_v` provides the value of `value`. - -a| -[frame=all,grid=none] -!==== -a! -[source] ----- -namespace sycl::ext::oneapi::experimental { - template struct is_nd_range_kernel; @@ -390,10 +321,9 @@ inline constexpr bool is_kernel_v = is_kernel::value; ---- !==== -If `Func` is the address of a function whose declaration is decorated with any -of the properties `range_kernel`, `nd_range_kernel`, or `single_task_kernel`; -the `is_kernel` trait provides the member constant `value` equal to -`true`. +If `Func` is the address of a function whose declaration is decorated with +either the `nd_range_kernel` property or the `single_task_kernel` property, the +`is_kernel` trait provides the member constant `value` equal to `true`. Otherwise `value` is `false`. The helper trait `is_kernel_v` provides the value of `value`. @@ -598,16 +528,14 @@ Once the application obtains a `kernel` object for a free function kernel, it can enqueue the kernel to a device using any of the SYCL functions that allow a kernel to be enqueued via a `kernel` object. The application must enqueue the free function kernel according to its type. -For example, a free function kernel defined via `range_kernel` can be enqueued -by calling the `handler::parallel_for` overload taking a `range`. -A free function kernel defined via `nd_range_kernel` can be enqueued by calling -the `handler::parallel_for` overload taking an `nd_range`. +For example, a free function kernel defined via `nd_range_kernel` can be +enqueued by calling the `handler::parallel_for` overload taking an `nd_range`. A free function kernel defined via `single_task_kernel` can be enqueued by calling `handler::single_task`. Attempting to enqueue a free function kernel using a mechanism that does not match its type results in undefined behavior. -Attempting to enqueue a free function kernel with a `range` or `nd_range` whose +Attempting to enqueue a free function kernel with an `nd_range` whose dimensionality does not match the free function kernel definition results in undefined behavior. @@ -659,15 +587,15 @@ sycl_ext_oneapi_kernel_properties] by applying the properties to the function declaration as illustrated below. ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::work_group_size<32>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::work_group_size<32>)) void iota(float start, float *ptr) { // ... } ``` -The kernel properties may appear either before or after the `range_kernel`, -`nd_range_kernel`, or `single_task_kernel` property. +The kernel properties may appear either before or after the `nd_range_kernel` +or `single_task_kernel` property. As with standard SYCL kernels, these kernel properties can be queried via `kernel::get_info` using either the `info::kernel::attributes` information @@ -700,26 +628,26 @@ enum myenum : int; // Each kernel is forward declared in the same namespace in which the // application declares it. -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void kernel1(int *); -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void kernel2(mystruct, myenum); template -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void kernel3(T *); namespace ns { -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void kernel4(int *); } ``` (The lines using `SYCL_EXT_ONEAPI_FUNCTION_PROPERTY` are exposition-only. Implementations will probably emit some implementation-specific code here -instead of using the macro because the macro and the `range_kernel` property +instead of using the macro because the macro and the `nd_range_kernel` property are probably defined in the `` header, which does not get -get included until after the integration header.) +included until after the integration header.) As a result, these implementations impose additional restrictions for functions that are declared as free function kernels: @@ -745,7 +673,7 @@ void caller() { // in the integration header } -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void foo(float) {/*...*/} ``` @@ -768,14 +696,16 @@ enqueue it on a device. ``` #include -namespace syclex = sycl::ext::oneapi::experimental; +namespace syclext = sycl::ext::oneapi; +namespace syclexp = sycl::ext::oneapi::experimental; static constexpr size_t NUM = 1024; +static constexpr size_t WGSIZE = 16; -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void iota(float start, float *ptr) { // Get the ID of this kernel iteration. - size_t id = syclex::this_kernel::get_id(); + size_t id = syclext::this_work_item::get_nd_item().get_global_linear_id(); ptr[id] = start + static_cast(id); } @@ -786,7 +716,7 @@ void main() { // Get a kernel bundle that contains the free function kernel "iota". auto exe_bndl = - syclex::get_kernel_bundle(ctxt); + syclexp::get_kernel_bundle(ctxt); // Get a kernel object for the "iota" function from that bundle. sycl::kernel k_iota = exe_bndl.ext_oneapi_get_kernel(); @@ -796,7 +726,8 @@ void main() { // Set the values of the kernel arguments. cgh.set_args(3.14f, ptr); - cgh.parallel_for({NUM}, k_iota); + sycl::nd_range ndr{{NUM}, {WGSIZE}}; + cgh.parallel_for(ndr, k_iota); }).wait(); } ``` @@ -810,20 +741,20 @@ cases. ``` #include -namespace syclex = sycl::ext::oneapi::experimental; +namespace syclexp = sycl::ext::oneapi::experimental; template -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void iota(T start, T *ptr) { // ... } -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::single_task_kernel)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::single_task_kernel)) void ping(float *x) { // ... } -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::single_task_kernel)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::single_task_kernel)) void ping(int *x) { // ... } @@ -831,13 +762,13 @@ void ping(int *x) { int main() { // When the free function kernel is templated, pass the address of a // specific instantiation. - sycl::kernel_id iota_float = syclex::get_kernel_id>(); - sycl::kernel_id iota_int = syclex::get_kernel_id>(); + sycl::kernel_id iota_float = syclexp::get_kernel_id>(); + sycl::kernel_id iota_int = syclexp::get_kernel_id>(); // When there are multiple overloads of a free function kernel, use a cast // to disambiguate. - sycl::kernel_id ping_float = syclex::get_kernel_id<(void(*)(float))ping>(); - sycl::kernel_id ping_int = syclex::get_kernel_id<(void(*)(int))ping>(); + sycl::kernel_id ping_float = syclexp::get_kernel_id<(void(*)(float))ping>(); + sycl::kernel_id ping_int = syclexp::get_kernel_id<(void(*)(int))ping>(); } ``` @@ -846,10 +777,10 @@ int main() { === Compiler diagnostics -My expectation is that {dpcpp} will emit a diagnostic if a function is -decorated as a free function kernel (e.g. via `syclex::range_kernel`) and the -function violates any of the restrictions listed above under "Defining a free -function kernel". +Our expectation is that {dpcpp} will emit a diagnostic if a function is +decorated as a free function kernel (e.g. via `syclexp::nd_range_kernel`) and +the function violates any of the restrictions listed above under "Defining a +free function kernel". (Except, of course, no diagnostic is required for violations of the last bullet because that cannot be diagnosed when compiling a single translation unit.) @@ -862,8 +793,8 @@ is defined as a static member function. === Integration header -My expectation is that {dpcpp} will use the integration header to implement the -traits and the queries like `get_kernel_id()`. +Our expectation is that {dpcpp} will use the integration header to implement +the traits and the queries like `get_kernel_id()`. The integration header will probably start with forward declarations of types used for the parameters to the free function kernels. Following this, the header can contain forward declarations of the free @@ -872,19 +803,19 @@ In order to avoid problems where functions with the same name in different namespaces "shadow" each other, the structure can look like this: ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void same_name(int arg1); static constexpr auto __sycl_shim1() {return (void(*)(int))same_name;} inline namespace { - SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) + SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void same_name(int arg1); static constexpr auto __sycl_shim2() {return (void(*)(int))same_name;} } namespace sycl { - template<> struct is_range_kernel<__sycl_shim1()> : std::true_type {}; - template<> struct is_range_kernel<__sycl_shim2()> : std::true_type {}; + template<> struct is_nd_range_kernel<__sycl_shim1()> : std::true_type {}; + template<> struct is_nd_range_kernel<__sycl_shim2()> : std::true_type {}; } ``` @@ -894,7 +825,7 @@ Thus, the {cpp} unqualified name lookup algorithm, finds the correct function definition. However, each helper function has a unique name, so it can be uniquely identified from the `sycl` namespace, where it is called to specialize the -`is_range_kernel` trait. +`is_nd_range_kernel` trait. === Decomposed kernel arguments @@ -910,8 +841,8 @@ argument. As a result, {dpcpp} passes each member variable as a separate OpenCL kernel argument. -An argument like this that is decomposed is still represent as a single -argument in SYCL source code. +A decomposed argument like this is still represented as a single argument in +SYCL source code. When invoking a free function kernel, the application sets the value of such an argument with a single call to `handler::set_arg`. For example, the application sets the value of an `accessor` by calling @@ -952,26 +883,26 @@ argument, effectively turning the call into a no-op. One option is like this: + ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void iota(float start, float *ptr) { /*...*/ } int main() { sycl::queue q; float *ptr = sycl::malloc_shared(N, q); - sycl::parallel_for(q, {N}, 1.f, ptr); + sycl::nd_launch(q, sycl::nd_range{{N}, {WGS}}, 1.f, ptr); } ``` + Another option is like this: + ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) void iota(float start, float *ptr) { /*...*/ } int main() { sycl::queue q; float *ptr = sycl::malloc_shared(N, q); - sycl::parallel_for(q, {N}, kfp, 1.f, ptr); + sycl::nd_launch(q, sycl::nd_range{{N}, {WGS}}, kfp, 1.f, ptr); } ``` + @@ -982,8 +913,8 @@ Where `kfp` would have some nicer name. + -- ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) -void iota(sycl::item<1> item, float start, float *ptr) { /*...*/ } +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void iota(sycl::nd_item<1> nditem, float start, float *ptr) { /*...*/ } ``` The advantage is that the user wouldn't need to use the functions in @@ -992,37 +923,40 @@ sycl_ext_oneapi_free_function_queries] to get the iteration index. Doing this raises some new questions, though: ** When the application sets the value of a kernel parameter via `set_arg`, - does argument index `0` correspond to the `item` or to the first parameter - after `item`? + does argument index `0` correspond to the `nd_item` or to the first + parameter after `nd_item`? For example, to set the value of `start` in the example above, does the application call `+set_arg(0, ...)+` or `+set_arg(1, ...)+`? Both seem like reasonable choices, so many users may need to read the documentation to determine what is right. -** If the first parameter is an index like `sycl::item<1>`, then the property - `syclex::range_kernel<1>` is somewhat redundant. +** If the first parameter is an index like `sycl::nd_item<1>`, then the + property `syclexp::nd_range_kernel<1>` is somewhat redundant. Should the compiler raise a diagnostic if they do not match? Or, should we invent a new property like: + ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::kernel_function)) -void iota(sycl::item<1> item, float start, float *ptr) { /*...*/ } +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::kernel_function)) +void iota(sycl::nd_item<1> item, float start, float *ptr) { /*...*/ } ``` -** In standard SYCL kernels, the iteration index can be anything that is - convertible from `sycl::item` or `sycl::nd_item`. - For example, it is common to use `id` for range kernels or `int` for - 1-dimensional range kernels. - However, both `id` and `int` can also be used as kernel parameters. - Therefore, something like this is ambiguous: +** In a standard SYCL nd-range kernel, the iteration index can be anything that + is convertible from `sycl::nd_item`. + For example, an application can define its own type like this: + ``` -SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclex::range_kernel<1>)) -void iota(sycl::id<1> i, float start, float *ptr) { /*...*/ } +struct global_index { + global_index(const sycl::nd_item<1> &ndi) {id = ndi.get_global_linear_id();} + size_t id; +}; + +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>)) +void iota(global_index i, float start, float *ptr) { /*...*/ } ``` + +However, this is potentially ambiguous. Is `i` the kernel's iteration index, or is it simply a kernel argument whose -type is `sycl::id`? +type is `global_index`? -- + We agreed that we _do_ need to support free function kernels that do not have @@ -1057,3 +991,26 @@ first parameter is an iteration index. probably `errc::invalid`)? Again, I think we should decide during implementation whether the overhead is minimal enough that we can mandate an error in the spec. + + +== Resolved issues + +* We considered supporting simple range kernels with the free function kernel + syntax, but we decided against it. + We want to give the implementation greater freedom to handle unusual ranges + for these kernels. + For example, we want to allow the implementation to do "range rounding" when + the range is not evenly divisible by a convenient work-group size. + To do this, the implementation rounds the range up to a convenient value and + also wraps the user's kernel with a function that skips the extra iterations. + We also want to allow the implementation to support very large ranges via a + wrapper that invokes the user's kernel multiple times for each invocation of + the wrapped kernel. + In both cases, the wrapper function would need to synthesize an `item` object + and pass this object to the user's kernel. + This is not possible, though, if the user's kernel gets the `item` object via + a free function like `this_work_item::get_item()`. + Since free function kernels are an advanced feature, we think it is OK if + they are limited to nd-range kernels. + Since single-task kernels present no obstacles, we also support these with + the free function kernel syntax.