From ebc3ddb9a60bad6f1f0e331da4044000e3a7fb60 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Fri, 10 Nov 2023 14:29:49 -0800 Subject: [PATCH] [SYCL][DOC] Add sycl_ext_oneapi_enqueue_functions (#11491) Adds a new extension proposal to simplify enqueuing work to a device. - Uses different function names for different use-cases. - Uses free-functions instead of member functions. - Requires developers to opt-in to the creation of event objects. --------- Signed-off-by: John Pennycook Co-authored-by: Ben Ashbaugh Co-authored-by: Greg Lueck --- ...sycl_ext_oneapi_enqueue_functions.asciidoc | 729 ++++++++++++++++++ 1 file changed, 729 insertions(+) create mode 100644 sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc new file mode 100644 index 0000000000000..a2bc1bab79d7e --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc @@ -0,0 +1,729 @@ += sycl_ext_oneapi_enqueue_functions + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023-2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 7 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. *Shipping software products should +not rely on APIs defined in this specification.* + + +== Overview + +SYCL 2020 provides multiple ways to enqueue work to a device. In some cases, a +single function name is used to enqueue kernels with very different use-cases +and execution models (e.g., `parallel_for(range)` and +`parallel_for(nd_range)`). In almost all cases, the functions are available in +multiple places (e.g., `queue::parallel_for` and `handler::parallel_for`). +In all cases, these functions return an `event` object by default, which has +been shown to introduce undesirable performance overhead. + +Additionally, we have received feedback from developers and implementers alike +that the number of `parallel_for` overloads is confusing, and that the way +reductions in particular are specified (as a parameter pack containing both +`sycl::reduction` objects and a kernel lambda) is problematic. + +This extension addresses these issues by: + +1. Using different function names for different use-cases. +2. Using free-functions instead of member functions. +3. Requiring developers to opt-in to the creation of `event` objects. +4. Bundling everything related to a kernel's launch configuration (i.e., its +range, any launch properties) into a single object. +5. Moving the reductions parameter pack after the kernel. + +This extension makes SYCL simpler and easier to document. It is also expected +to improve the performance of many SYCL applications, where `event` objects are +not required to describe application behavior. + +All functions proposed in this extension accept as their first argument an +object that represents where a command should be submitted, allowing the new +functions to be used either at command-group scope or as a replacement for +existing queue shortcuts. A future version of this extension may adjust this +overload set to include functions compatible with future C++ concepts (e.g, +by accepting a scheduler and returning a sender). + + +=== Usage example + +The example below demonstrates that the syntax proposed here requires only +minor changes to existing applications, while retaining their structure. + + +==== SYCL 2020 + +[source,c++] +---- +q.submit([&](sycl::handler& h) { +  sycl::accessor result { buf, h, sycl::write_only, sycl::no_init }; +  h.parallel_for(1024, [=](sycl::id<1> idx) { +    result[idx] = idx; +  }); +}); + +float* output = sycl::malloc_shared(1, q); +*output = 0; +std::vector depEvents = /* some dependencies */; +sycl::event e = q.parallel_for(sycl::nd_range<1>{1024, 16}, depEvents, + sycl::reduction(output, sycl::plus<>()), + [=](sycl::nd_item<1> it, auto& sum) { + sum += it.get_global_id(); +}); +e.wait(); +sycl::free(output, q); +---- + + +==== Proposed syntax + +[source,c++] +---- +using syclex = sycl::ext::oneapi::experimental; + +syclex::submit(q, [&](sycl::handler& h) { +  sycl::accessor result { buf, h, sycl::write_only, sycl::no_init }; +  syclex::parallel_for(h, 1024, [=](sycl::id<1> idx) { +    result[idx] = idx; +  }); +}); + +float* output = sycl::malloc_shared(1, q); +*output = 0; +std::vector depEvents = /* some dependencies */; +sycl::event e = syclex::submit_with_event(q, [&](sycl::handler& h) { + h.depends_on(depEvents); + syclex::nd_launch(h, sycl::nd_range<1>{1024, 16}, + [=](sycl::nd_item<1> it, auto& sum) { + sum += it.get_global_id(); + }, + sycl::reduction(output, sycl::plus<>()) +}); +e.wait(); +sycl::free(output, q); +---- + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS` to one of the values defined in the +table below. Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== Launch configuration + +A launch configuration object of type `launch_config` is used to bundle +together all components of a kernel's launch configuration, including: + +1. The range of execution. +2. Any compile-time properties. + +Any compile-time properties passed as part of a `launch_config` only affect the +way in which the kernel is launched. They cannot be used to define information +about the kernel itself. This extension does not define any properties for +`launch_config`, but other extensions are expected to define such properties. + +[_Note:_ The properties defined in the +link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_kernel_properties] +extension (e.g., `work_group_size`) cannot be used via `launch_config`. In +order to use these properties with a kernel, the kernel must be a named +functioned object which exposes the properties via +`get(sycl::ext::oneapi::experimental::properties_tag)` as described in that +extension. _{endnote}_] + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +// Available only when Range is sycl::range or sycl::nd_range +template +class launch_config { +public: + launch_config(Range, Properties = {}); +}; + +} +---- + +[source,c++] +---- +launch_config(Range, Properties); +---- +_Constraints_: Available only if `Range` is a `sycl::range` or +`sycl::nd_range`, and `Properties` is a compile-time property list. + +_Effects_: Constructs a `launch_config` from the specified range and +properties. + + +=== Command-group submission + +When specifying event dependencies or requesting the creation of events, +commands must be wrapped in a _command-group_. + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void submit(sycl::queue q, CommandGroupFunc&& cgf); + +} +---- +!==== +_Effects_: Submits a command-group function object (as defined by the SYCL +specification) to the `sycl::queue`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +sycl::event submit_with_event(sycl::queue q, CommandGroupFunc&& cgf); + +} +---- +!==== +_Effects_: Submits a command-group function object (as defined by the SYCL +specification) to the `sycl::queue`. + +_Returns_: A `sycl::event` associated with the submitted command. + +|==== + + +=== Commands + +==== Single tasks + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void single_task(sycl::queue q, const KernelType& k); + +template +void single_task(sycl::handler h, const KernelType& k); + +} +---- +!==== +_Effects_: Enqueues a kernel function to the `sycl::queue` or `sycl::handler` +as a single task. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void single_task(sycl::queue q, const sycl::kernel& k, Args&&... args); + +template +void single_task(sycl::handler h, const sycl::kernel& k, Args&&... args); + +} +---- +!==== +_Effects_: Enqueues a kernel object to the `sycl::queue` or `sycl::handler` as +a single task. The arguments in `args` are passed to the kernel in the same +order. + +|==== + + +==== Basic kernels + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void parallel_for(sycl::queue q, sycl::range r, + const KernelType& k, Reductions&&... reductions); + +template +void parallel_for(sycl::handler h, sycl::range r, + const KernelType& k, Reductions&&... reductions); + +} +---- +!==== +_Constraints_: The parameter pack consists of 0 or more objects created by the +`sycl::reduction` function. + +_Effects_: Enqueues a kernel function to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the number of work-items specified by a `sycl::range`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void parallel_for(sycl::queue q, + launch_config, Properties> c, + const KernelType& k, Reductions&&... reductions); + +template +void parallel_for(sycl::handler h, + launch_config, Properties> c, + const KernelType& k, Reductions&&... reductions); + +} +---- +!==== +_Constraints_: The parameter pack consists of 0 or more objects created by the +`sycl::reduction` function. + +_Effects_: Enqueues a kernel function to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the launch configuration specified by a +`launch_config`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void parallel_for(sycl::queue q, sycl::range r, + const sycl::kernel& k, Args&&... args); + +template +void parallel_for(sycl::handler h, sycl::range r, + const sycl::kernel& k, Args&&... args); + +} +---- +!==== +_Effects_: Enqueues a kernel object to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the number of work-items specified by a `sycl::range`. +The arguments in `args` are passed to the kernel in the same order. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void parallel_for(sycl::queue q, + launch_config, Properties> c, + const sycl::kernel& k, Args&& args...); + +template +void parallel_for(sycl::handler h, + launch_config, Properties> c, + const sycl::kernel& k, Args&& args...); + +} +---- +!==== +_Effects_: Enqueues a kernel object to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the launch configuration specified by a +`launch_config`. The arguments in `args` are passed to the kernel in the same +order. + +|==== + + +==== ND-range kernels + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void nd_launch(sycl::queue q, sycl::nd_range r, + const KernelType& k, Reductions&&... reductions); + +template +void nd_launch(sycl::handler h, sycl::nd_range r, + const KernelType& k, Reductions&&... reductions); + +} +---- +!==== +_Constraints_: The parameter pack consists of 0 or more objects created by the +`sycl::reduction` function. + +_Effects_: Enqueues a kernel function to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the number of work-items specified by a +`sycl::nd_range`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void nd_launch(sycl::queue q, + launch_config, Properties> c, + const KernelType& k, Reductions&&... reductions); + +template +void nd_launch(sycl::handler h, + launch_config, Properties> c, + const KernelType& k, Reductions&&... reductions); + +} +---- +!==== +_Constraints_: The parameter pack consists of 0 or more objects created by the +`sycl::reduction` function. + +_Effects_: Enqueues a kernel function to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the launch configuration specified by a +`launch_config`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void nd_launch(sycl::queue q, sycl::nd_range r, + const sycl::kernel& k, Args&&... args); + +template +void nd_launch(sycl::handler h, sycl::nd_range r, + const sycl::kernel& k, Args&&... args); + +} +---- +!==== +_Effects_: Enqueues a kernel object to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the number of work-items specified by a +`sycl::nd_range`. The arguments in `args` are passed to the kernel in the same +order. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void nd_launch(sycl::queue q, + launch_config, Properties> c, + const sycl::kernel& k, Args&& args...); + +template +void nd_launch(sycl::handler h, + launch_config, Properties> c, + const sycl::kernel& k, Args&& args...); + +} +---- +!==== +_Effects_: Enqueues a kernel object to the `sycl::queue` or `sycl::handler` +as a basic kernel, using the launch configuration specified by a +`launch_config`. The arguments in `args` are passed to the kernel in the same +order. + +|==== + + +==== Memory operations + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void memcpy(sycl::queue q, void* dest, const void* src, size_t numBytes); + +void memcpy(sycl::handler h, void* dest, const void* src, size_t numBytes); + +} +---- +!==== +_Effects_: Enqueues a `memcpy` to the `sycl::queue` or `sycl::handler`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void copy(sycl::queue q, const T* src, T* dest, size_t count); + +template +void copy(sycl::handler h, const T* src, T* dest, size_t count); + +} +---- +!==== +_Effects_: Enqueues a `copy` to the `sycl::queue` or `sycl::handler`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void memset(sycl::queue q, void* ptr, int value, size_t numBytes); + +void memset(sycl::handler h, void* ptr, int value, size_t numBytes); + +} +---- +!==== +_Effects_: Enqueues a `memset` to the `sycl::queue` or `sycl::handler`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +void fill(sycl::queue q, T* ptr, const T& pattern, size_t count); + +template +void fill(sycl::handler h, T* ptr, const T& pattern, size_t count); + +} +---- +!==== +_Effects_: Enqueues a `fill` to the `sycl::queue` or `sycl::handler`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void prefetch(sycl::queue q, void* ptr, size_t numBytes); + +void prefetch(sycl::handler h, void* ptr, size_t numBytes); + +} +---- +!==== +_Effects_: Enqueues a `prefetch` to the `sycl::queue` or `sycl::handler`. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void mem_advise(sycl::queue q, void* ptr, size_t numBytes, int advice); + +void mem_advise(sycl::handler h, void* ptr, size_t numBytes, int advice); + +} +---- +!==== +_Effects_: Enqueues a `mem_advise` to the `sycl::queue` or `sycl::handler`. + +|==== + + +==== Command barriers + +The functions in this section are only available if the +link:../supported/sycl_ext_oneapi_enqueue_barrier.asciidoc[ + sycl_ext_oneapi_enqueue_barrier] extension is supported. + +|==== +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void barrier(sycl::queue q); + +void barrier(sycl::handler h); + +} +---- +!==== +_Effects_: Enqueues a command barrier to the `sycl::queue` or `sycl::handler`. +Any commands submitted after this barrier cannot begin execution until all +previously submitted commands (and any commands associated with dependendent +events) have completed. + +a| +[frame=all,grid=none] +!==== +a! +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +void partial_barrier(sycl::queue q, const std::vector& events); + +void partial_barrier(sycl::handler h, const std::vector& events); + +} +---- +!==== +_Effects_: Enqueues a _partial_ command barrier to the `sycl::queue` or +`sycl::handler`. Any commands submitted after this barrier cannot begin +execution until all commands associated with `events` (and any commands +associated with other dependent events) have completed. + +[_Note:_ If `events` is empty and a partial barrier has no other dependencies +(e.g., specified by `handler::depends_on`), it is not required to wait for any +commands unless the `queue` is in-order. Implementations may be able to +optimize such partial barriers. +_{endnote}_] +|==== + + +== Issues + +. What should `submit_with_event` be called? ++ +-- +*UNRESOLVED*: `submit_with_event` is descriptive but verbose. Synonyms for +`submit` like `enqueue` do not obviously mean "return an event". `record` may +be confused with the recording functionality associated with SYCL graphs. +-- + +. What about `accessor` overloads and `update_host`? ++ +-- +*UNRESOLVED*: Supporting `accessor` overloads with this new approach is +possible, but additional design work is required to understand how to handle +placeholder accessors. Whether `update_host` should be exposed via this new +free-function interface is an open question. +--