From 419fe40a3c925dfe1b565d00aa045d5a6dde9d1a Mon Sep 17 00:00:00 2001 From: Matthias Knorr Date: Fri, 23 Jun 2023 15:56:11 +0200 Subject: [PATCH] Multi-device C/C++ sample --- .github/workflows/presubmit.yml | 6 +- samples/core/CMakeLists.txt | 1 + samples/core/multi-device/CMakeLists.txt | 27 + samples/core/multi-device/README.md | 184 +++++++ samples/core/multi-device/convolution.cl | 52 ++ samples/core/multi-device/main.c | 646 +++++++++++++++++++++++ samples/core/multi-device/main.cpp | 501 ++++++++++++++++++ 7 files changed, 1415 insertions(+), 2 deletions(-) create mode 100644 samples/core/multi-device/CMakeLists.txt create mode 100644 samples/core/multi-device/README.md create mode 100644 samples/core/multi-device/convolution.cl create mode 100644 samples/core/multi-device/main.c create mode 100644 samples/core/multi-device/main.cpp diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 01731549..8db9ca68 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -702,8 +702,10 @@ jobs: - name: Test working-directory: ${{runner.workspace}}/OpenCL-SDK/build run: | - OCL_ICD_VENDORS=/usr/local/etc/OpenCL/vendors ctest -C Debug --output-on-failure --no-tests=error --parallel `sysctl -n hw.logicalcpu` - OCL_ICD_VENDORS=/usr/local/etc/OpenCL/vendors ctest -C Release --output-on-failure --no-tests=error --parallel `sysctl -n hw.logicalcpu` + # Unreproducible segfault + export EXCLUDE_REGEX='multidevice.*' + OCL_ICD_VENDORS=/usr/local/etc/OpenCL/vendors ctest -C Debug --output-on-failure --no-tests=error --parallel `sysctl -n hw.logicalcpu` --exclude-regex "$EXCLUDE_REGEX" + OCL_ICD_VENDORS=/usr/local/etc/OpenCL/vendors ctest -C Release --output-on-failure --no-tests=error --parallel `sysctl -n hw.logicalcpu` --exclude-regex "$EXCLUDE_REGEX" - name: Install run: | diff --git a/samples/core/CMakeLists.txt b/samples/core/CMakeLists.txt index 75d8357d..558902e5 100644 --- a/samples/core/CMakeLists.txt +++ 
b/samples/core/CMakeLists.txt @@ -16,6 +16,7 @@ add_subdirectory(enumopencl) add_subdirectory(copybuffer) add_subdirectory(copybufferkernel) add_subdirectory(saxpy) +add_subdirectory(multi-device) add_subdirectory(reduce) add_subdirectory(blur) add_subdirectory(binaries) diff --git a/samples/core/multi-device/CMakeLists.txt b/samples/core/multi-device/CMakeLists.txt new file mode 100644 index 00000000..ccf8933b --- /dev/null +++ b/samples/core/multi-device/CMakeLists.txt @@ -0,0 +1,27 @@ +# Copyright (c) 2023 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_sample( + TEST + TARGET multidevice + VERSION 300 + SOURCES main.c + KERNELS convolution.cl) + +add_sample( + TEST + TARGET multidevicecpp + VERSION 300 + SOURCES main.cpp + KERNELS convolution.cl) diff --git a/samples/core/multi-device/README.md b/samples/core/multi-device/README.md new file mode 100644 index 00000000..f259482b --- /dev/null +++ b/samples/core/multi-device/README.md @@ -0,0 +1,184 @@ +# Multi-device Convolution Example + +## Sample purpose +This example showcases how to set up a multi-device execution of a given kernel using two OpenCL-compatible devices. + +## Key APIs and Concepts +The main idea behind this example is that a given kernel can be run simultaneously by two (or potentially more) devices, therefore reducing its execution time. One can essentially think of two strategies for this workflow: +1. 
each device computes its proportional part of the solution at its own speed and the results are combined on the host's side when finished, and +2. each device executes the kernel at its own speed but after each iteration there is P2P communication between the devices to share the partial results. + +This example implements the first approach. + +### Kernel logic +The kernel is a simple $3 \times 3$ convolution, meaning that the convolution over the input matrix is performed using a $3 \times 3$ mask matrix. + +In this implementation of the convolution kernel we assume that the input matrix is padded with 0s, so no extra conditional logic is necessary to ensure that the mask is not applied to out-of-bounds elements (e.g. when processing element $(0,0)$ of the output matrix). + +### Device fission +In order to simplify the conditions under which the example can be executed, we introduced the use of OpenCL's device fission. This feature allows the user to partition a device into *sub-devices*. These sub-devices correspond physically to a certain region of the original device, but are virtually perceived as whole new devices. This partition of the device can be made in several ways. +- Partition equally by the number compute units (threads). After specifying the number of compute units that each sub-device should have, OpenCL creates as many sub-devices as possible under that restriction. If the number of compute units specified does not divide the total amount of compute units available, the leftovers do not get assigned to any sub-device. This option may be used when we want to enable task parallelism in our program, as tasks can be evenly distributed among the sub-devices. + +- Partition by counts (of compute units). With this option we can specify the exact number of compute units that we want for each sub-device. 
This approach may be used when we want to isolate some part of the device for high priority tasks while preventing the lower priority ones from interrupting/interfering with them. + +- Partition by affinity domain. The device is split into sub-devices containing compute units that share part of a cache hierarchy. For instance, when executing high-throughput jobs with little shared memory in a NUMA multiprocessor it could be beneficial for maximizing the throughput to partition the device so compute units from the same NUMA node are grouped together. That way each job can run on a sub-device (NUMA node) and get all of its resources without competing with the other jobs. On the other hand, if the program requires a great amount of shared memory, creating sub-devices that group compute units sharing the same cache can be the best option. + +This sample tries to exploit task parallelism, so the first approach is the one used: from one device we create two sub-devices, each with half of the available compute units. + +_Note: A device can be fissioned in more than one level, meaning that a sub-device of a device can also be partitioned into multiple (sub-)sub-devices._ + +#### Sub-buffers +Global buffer objects in OpenCL are one-dimensional collections of elements. From these objects can be obtained new buffer objects, known as sub-buffers. The main use-cases of these sub-buffers are the following: +- When we need accessing a buffer with different access flags than were specified in buffer creation. E.g. if we create a global buffer with `READ_WRITE` permissions, we then can create two sub-buffers from it, one with `READ_ONLY` permissions and other with `WRITE_ONLY` permissions. Therefore, being able to pass the same information with different permissions to different kernels, which can come in handy when one of the kernels that access the buffer does not perform writes or reads on the buffer as some internal coherence routines can be omitted when launching the kernels. 
+- When it's necessary to pass subsets of the same buffer to different kernels calls. E.g. in this sample we need to enqueue one kernel call to one of the sub-devices that convolutes the left half of the matrix and another one to the other sub-device which convolutes the right part of the matrix. + +_Note: Unlike sub-devices, a sub-buffer of a global buffer cannot be partitioned again into (sub-)sub-buffers._ + +## Application flow +### Overview +1. Select a device. By default the application will select the first device available, but we provide a command-line option to let user specify which type of device prefers to use (e.g. "cpu" or "gpu"). +2. Query compute units available on the device and create two sub-devices from it with half of the compute units each. +3. Compile kernel. +4. Initialize host-side input and output matrices. Pad input matrix with 0s so the convolution kernel does not access to out-of-bounds elements. +5. Initialize device-side global buffers. +6. Set up OpenCL objects for the sub-devices. In particular, create sub-buffers for input and output matrices. +7. Enqueue kernel calls on each device with the correspondent arguments and wait until they finish. +8. Run the host-side convolution algorithm. +9. Fetch and combine results from devices. Compare the solution obtained with the host's and print to the standard output the result of this validation. +10. Free memory and OpenCL resources. + +### Device fission +Before creating sub-devices from a given device we must think about which partitioning approach is the most appropriate for the kernel/s at hand. In our case, we would like to exploit task parallelism, as the objective is to perform the convolution using 2 devices at the same time to speed it up. Therefore, the best approach is to create two sub-devices with equal number of compute units. 
+ +As we don't need to perform any other task, we can use all the compute units, so we query how many compute units are available in total using the `clGetDeviceInfo`/`cl::Device::getInfo` function with the `CL_DEVICE_MAX_COMPUTE_UNITS` parameter. + +With this information we can then create an array of `cl_device_partition_property` containing the properties of the partition of the device. In our case, we must specify that we want to partition the device equally by adding the macro `CL_DEVICE_PARTITION_EQUALLY` and we must indicate how many compute units each device will get, which is half of the maximum available. + +Lastly, we use the `clCreateSubDevices`/`cl::Device::createSubDevices` function to fission the device. + +### Sub-buffers creation +For creating a sub-buffer from a global buffer object we first need to determine two important parameters: +- Which permissions will it have. It cannot have more permissions than the original buffer, e.g. if the global buffer was declared as read-only, the subsequent sub-buffers created from it cannot be write-only or read-write. + +- What range from the global buffer will be mapped onto the sub-buffer. We need to consider which kernel is going to take the sub-buffer as input and/or output and determine which range from the global buffer must be mapped. + +In our case we use two read-only input buffers, one for the input matrix and one for the mask, and one write-only output buffer. However, we only need sub-buffers for the input and output matrix, as the mask is the same for both kernel calls. Thus, we only create two sub-buffers from the input global buffer and two more from the output one. +The flags that we set when creating them are `CL_MEM_READ_ONLY` for the input sub-buffers and `CL_MEM_WRITE_ONLY` for the output ones. + +For the ranges mapped into the sub-buffers, we take half of the input matrix[^1] for each sub-buffer and half of the output buffer for each too. 
+ +[^1]:_The input buffers are actually overlapped, as we need one extra column after/before the middle column when enqueuing the first/second call to the kernel for performing the convolution correctly._ + +### Kernel launch +The rest of the program does not differ much from the usual single-device kernel launch. The only difference is that each sub-device will need a separate set of runtime objects to be created: device objects, kernel functors, command queues and events. + +Once everything is set up, a kernel call is enqueued to the command queue of each device with the correspondent input and output parameters, and two different events are used to wait for them to be finished. When the devices finish the computations, the results are combined in a single host matrix and compared to the host-side results. + +## Used API surface +### C +```c +CL_BLOCKING +CL_DEVICE_MAX_COMPUTE_UNITS +CL_DEVICE_PARTITION_EQUALLY +CL_DEVICE_PLATFORM +CL_DEVICE_TYPE_ALL +CL_HPP_TARGET_OPENCL_VERSION +CL_INVALID_ARG_VALUE +CL_KERNEL_WORK_GROUP_SIZE +CL_MEM_COPY_HOST_PTR +CL_MEM_HOST_READ_ONLY +CL_MEM_READ_ONLY +CL_MEM_WRITE_ONLY +CL_PROFILING_COMMAND_END +CL_PROFILING_COMMAND_START +CL_QUEUE_PROFILING_ENABLE +CL_QUEUE_PROPERTIES +CL_SUCCESS +cl_buffer_create_type +cl_command_queue +cl_command_queue_properties +cl_context +cl_device_partition_property +cl_device_type +cl_event +cl_float +cl_int +cl_kernel +cl_mem +cl_mem_flags +cl_platform_id +cl_program +cl_sdk_fill_with_random_ints_range(pcg32_random_t*, cl_int*, size_t, cl_int, cl_int) +cl_sdk_options_DeviceTriplet +cl_sdk_options_Diagnostic +cl_sdk_options_SingleDevice +cl_uint +cl_uint2 +cl_ulong +cl_util_build_program(cl_program, cl_device_id, char*) +cl_util_get_device(cl_uint, cl_uint, cl_device_type, cl_int*) +cl_util_get_event_duration(cl_event, cl_profiling_info, cl_profiling_info, cl_int*) +cl_util_print_device_info*(cl_device_id) +cl_util_print_error(cl_int) +cl_util_read_text_file(char*const, size_t*const, cl_int*) 
+get_dev_type(char*) +clCreateBuffer(cl_context, cl_mem_flags, size_t, void*, cl_int*) +clCreateSubBuffer(cl_mem, cl_mem_flags, cl_buffer_create_type, const void*, cl_int*) +clCreateCommandQueue(cl_context, cl_device_id, cl_command_queue_properties, cl_int*) +clCreateCommandQueueWithProperties(cl_context, cl_device_id, cl_queue_properties*, cl_int*) -> OpenCL >= 2.0 +clCreateContext(cl_context_properties*, cl_uint, cl_device_id*, void *(char*, void*,size_t, void*), void*, cl_int*) +clCreateKernel(cl_program, char*, cl_int*) +clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void*, size_t*) +clCreateProgramWithSource(cl_context, cl_uint, char**, size_t*, cl_int*) +clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, size_t*, size_t*, size_t*, cl_uint, cl_event*, cl_event*) +clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void*, cl_uint, cl_event*, cl_event*) +clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*) +clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void*, size_t*) +clGetPlatformIDs(cl_uint, cl_platform_id*, cl_uint*) +clReleaseCommandQueue(cl_command_queue) +clReleaseContext(cl_context) +clReleaseKernel(cl_kernel) +clReleaseMemObject(cl_mem) +clReleaseProgram(cl_program) +clSetKernelArg(cl_kernel, cl_uint, size_t, void *) +clWaitForEvents(cl_uint, cl_event*) +``` + +### C++ +```c++ +cl::Buffer::Buffer(const Context&, cl_mem_flags, size_type, void*, cl_int*=NULL) +cl::Buffer::createSubBuffer(cl_mem_flags, cl_buffer_create_type, const void*, cl_int*=NULL) +cl::BuildError +cl::CommandQueue::CommandQueue(const cl::Context&, cl::QueueProperties, cl_int*=NULL) +cl::CommandQueue::enqueueReadBuffer(const Buffer&, cl_bool, size_type, size_type, void*, const std::vector*=nullptr, cl::Event*=nullptr) +cl::Context +cl::Device::Device() +cl::Device::createSubDevices(const cl_device_partition_property*, std::vector*) +cl::EnqueueArgs::EnqueueArgs(cl::CommandQueue&, 
cl::NDRange, cl::NDRange) +cl::Error +cl::Event +cl::Kernel +cl::KernelFunctor::KernelFunctor(const Program&, const string, cl_int*=NULL) +cl::NDRange::NDRange(size_t, size_t) +cl::NullRange +cl::Platform::Platform() +cl::Platform::Platform(cl::Platform) +cl::Platform::get(vector*) +cl::Program::Program() +cl::Program::Program(cl::Program) +cl::WaitForEvents(const vector&) +cl::copy(const CommandQueue&, const cl::Buffer&, IteratorType, IteratorType) +cl::sdk::comprehend() +cl::sdk::fill_with_random() +cl::sdk::get_context(cl_uint, cl_uint, cl_device_type, cl_int*) +cl::sdk::options::SingleDevice +cl::sdk::parse() +cl::sdk::parse_cli() +cl::sdk::options::DeviceTriplet +cl::sdk::options::Diagnostic +cl::sdk::options::SingleDevice +cl::string::string(cl::string) +cl::util::Error +cl::util::get_duration(cl::Event&) +cl::util::opencl_c_version_contains(const cl::Device&, const cl::string&) +``` diff --git a/samples/core/multi-device/convolution.cl b/samples/core/multi-device/convolution.cl new file mode 100644 index 00000000..90b229a5 --- /dev/null +++ b/samples/core/multi-device/convolution.cl @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +kernel void convolution_3x3(const global float* in, global float* out, + const global float* mask, const uint2 out_dim) +{ + const uint2 gid = (uint2)(get_global_id(0), get_global_id(1)); + const uint mask_dim = 3; + const uint pad_width = mask_dim / 2; + + // Padded constants. + const uint2 in_dim = out_dim + pad_width * 2; + + // Check possible out of bounds. + if (!(gid.x < out_dim.x && gid.y < out_dim.y)) + { + return; + } + + // Perform convolution. Fix one column at a time and iterate over each + // element of it, as data is stored column-major. + float result = 0.0f; + #if __OPENCL_C_VERSION__ >= 200 + __attribute__((opencl_unroll_hint)) + #endif + for(uint y = 0; y < mask_dim; ++y) + { + #if __OPENCL_C_VERSION__ >= 200 + __attribute__((opencl_unroll_hint)) + #endif + for(uint x = 0; x < mask_dim; ++x) + { + result += mask[y * mask_dim + x] * in[(gid.y + y) * in_dim.x + (gid.x + x)]; + } + } + + // Write result to correspoding output cell. + out[gid.y * out_dim.x + gid.x] = result; +} diff --git a/samples/core/multi-device/main.c b/samples/core/multi-device/main.c new file mode 100644 index 00000000..28e7b28e --- /dev/null +++ b/samples/core/multi-device/main.c @@ -0,0 +1,646 @@ +/* + * Copyright (c) 2023 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// OpenCL SDK includes. +#include +#include +#include +#include + +// OpenCL Utils includes. +#include +#include +#include + +// Standard header includes. 
+#include +#include +#include +#include + +// Sample-specific options. +struct convolution_options +{ + size_t x_dim; + size_t y_dim; +}; + +// Add option to CLI-parsing SDK utility for input dimensions. +cag_option ConvolutionOptions[] = { { .identifier = 'x', + .access_letters = "x", + .access_name = "x_dim", + .value_name = "(positive integral)", + .description = "x dimension of input" }, + + { .identifier = 'y', + .access_letters = "y", + .access_name = "y_dim", + .value_name = "(positive integral)", + .description = "y dimension of input" } }; + +ParseState parse_ConvolutionOptions(const char identifier, + cag_option_context* cag_context, + struct convolution_options* opts) +{ + const char* value; + + switch (identifier) + { + case 'x': + if ((value = cag_option_get_value(cag_context))) + { + opts->x_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + case 'y': + if ((value = cag_option_get_value(cag_context))) + { + opts->y_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + } + return NotParsed; +} + +cl_int parse_options(int argc, char* argv[], + struct cl_sdk_options_Diagnostic* diag_opts, + struct cl_sdk_options_SingleDevice* dev_opts, + struct convolution_options* convolution_opts) +{ + cl_int error = CL_SUCCESS; + struct cag_option *opts = NULL, *tmp = NULL; + size_t n = 0; + + // Prepare options array. + MEM_CHECK(opts = add_CLI_options(opts, &n, DiagnosticOptions, + CAG_ARRAY_SIZE(DiagnosticOptions)), + error, end); + MEM_CHECK(tmp = add_CLI_options(opts, &n, SingleDeviceOptions, + CAG_ARRAY_SIZE(SingleDeviceOptions)), + error, end); + opts = tmp; + MEM_CHECK(tmp = add_CLI_options(opts, &n, ConvolutionOptions, + CAG_ARRAY_SIZE(ConvolutionOptions)), + error, end); + opts = tmp; + + char identifier; + cag_option_context cag_context; + + // Prepare the context and iterate over all options. 
+ cag_option_prepare(&cag_context, opts, n, argc, argv); + while (cag_option_fetch(&cag_context)) + { + ParseState state = NotParsed; + identifier = cag_option_get(&cag_context); + + PARS_OPTIONS(parse_DiagnosticOptions(identifier, diag_opts), state); + PARS_OPTIONS( + parse_SingleDeviceOptions(identifier, &cag_context, dev_opts), + state); + PARS_OPTIONS(parse_ConvolutionOptions(identifier, &cag_context, + convolution_opts), + state); + + if (identifier == 'h') + { + printf("Usage: dev_optsdevice [OPTION]...\n"); + printf("Option name and value should be separated by '=' or a " + "space\n"); + printf("Demonstrates convolution calculation with two " + "(sub)devices.\n\n"); + cag_option_print(opts, n, stdout); + exit((state == ParseError) ? CL_INVALID_ARG_VALUE : CL_SUCCESS); + } + } + +end: + free(opts); + return error; +} + +// Host-side implementation of the convolution for verification. Padded input +// assumed. +void host_convolution(const cl_float* in, cl_float* out, const cl_float* mask, + const cl_uint x_dim, const cl_uint y_dim) +{ + const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const cl_uint in_dim_x = x_dim + pad_width * 2; + + for (cl_uint gid_x = 0; gid_x < x_dim; ++gid_x) + { + for (cl_uint gid_y = 0; gid_y < y_dim; ++gid_y) + { + float result = 0.f; + for (cl_uint y = 0; y < mask_dim; ++y) + { + for (cl_uint x = 0; x < mask_dim; ++x) + { + result += mask[y * mask_dim + x] + * in[(gid_y + y) * in_dim_x + (gid_x + x)]; + } + } + out[gid_y * x_dim + gid_x] = result; + } + } +} + +int main(int argc, char* argv[]) +{ + cl_int error = CL_SUCCESS; + cl_int end_error = CL_SUCCESS; + cl_device_id dev; + cl_context context; + cl_program program; + cl_mem dev_input_grid, dev_output_grid, dev_mask; + + cl_kernel convolutions[2] = { 0 }; + cl_command_queue sub_queues[2] = { 0 }; + cl_mem sub_input_grids[2] = { 0 }; + cl_mem sub_output_grids[2] = { 0 }; + cl_event events[2] = { 0 }; + + // Parse command-line options. 
+ struct cl_sdk_options_Diagnostic diag_opts = { .quiet = false, + .verbose = false }; + + // By default assume that there is only one device available. + // dev_opts->number is set to 1 so that when calling to cl_util_get_device + // for the second device there is no index out of range. + struct cl_sdk_options_SingleDevice dev_opts = { + .triplet = { 0, 0, CL_DEVICE_TYPE_ALL } + }; + struct convolution_options convolution_opts = { .x_dim = 4096, + .y_dim = 4096 }; + + OCLERROR_RET( + parse_options(argc, argv, &diag_opts, &dev_opts, &convolution_opts), + error, end); + + // Create runtime objects based on user preference or default. + OCLERROR_PAR(dev = cl_util_get_device(dev_opts.triplet.plat_index, + dev_opts.triplet.dev_index, + dev_opts.triplet.dev_type, &error), + error, dev); + + if (!diag_opts.quiet) + { + cl_util_print_device_info(dev); + } + + if (diag_opts.verbose) + { + printf("Creating sub-devices..."); + fflush(stdout); + } + +#if CL_HPP_TARGET_OPENCL_VERSION < 120 + fprintf(stderr, + "Error: OpenCL subdevices not supported before version 1.2 "); + exit(EXIT_FAILURE); +#endif + + // Create subdevices, each with half of the compute units available. + cl_uint max_compute_units = 0; + OCLERROR_RET(clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &max_compute_units, NULL), + error, dev); + cl_device_partition_property subdevices_properties[] = { + (cl_device_partition_property)CL_DEVICE_PARTITION_EQUALLY, + (cl_device_partition_property)(max_compute_units / 2), 0 + }; + + // Initialize subdevices array with one device and then reallocate for + // MacOS and Windows not to complain about NULL subdevices array. 
cl_uint subdev_count = max_compute_units;
+ const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const size_t x_dim = convolution_opts.x_dim; + const size_t y_dim = convolution_opts.y_dim; + const size_t pad_x_dim = x_dim + 2 * pad_width; + const size_t pad_y_dim = y_dim + 2 * pad_width; + + const size_t input_bytes = sizeof(cl_float) * pad_x_dim * pad_y_dim; + const size_t output_bytes = sizeof(cl_float) * x_dim * y_dim; + const size_t mask_bytes = sizeof(cl_float) * mask_dim * mask_dim; + + if (diag_opts.verbose) + { + printf("done.\nInitializing host-side storage...\n"); + fflush(stdout); + } + + // Random number generator. + pcg32_random_t rng; + pcg32_srandom_r(&rng, 11111, -2222); + + cl_float* h_input_grid; + cl_float* h_output_grid; + cl_float* h_mask; + + // Initialize input matrix. The input will be padded to remove + // conditional branches from the convolution kernel for determining + // out-of-bounds. + MEM_CHECK(h_input_grid = (cl_float*)malloc(input_bytes), error, prg); + if (diag_opts.verbose) + { + printf(" Generating %zu random numbers for convolution input grid...", + x_dim * y_dim); + fflush(stdout); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_input_grid, + pad_x_dim * pad_y_dim, -1000, 1000); + // Fill with 0s the extra rows and columns added for padding. + for (size_t y = 0; y < pad_y_dim; ++y) + { + for (size_t x = 0; x < pad_x_dim; ++x) + { + if (x == 0 || y == 0 || x == (pad_x_dim - 1) + || y == (pad_y_dim - 1)) + { + h_input_grid[y * pad_x_dim + x] = 0; + } + } + } + + // Declare output matrix. Output will not be padded. + MEM_CHECK(h_output_grid = (cl_float*)malloc(output_bytes), error, hinput); + + // Initialize convolution mask. 
+ MEM_CHECK(h_mask = (cl_float*)malloc(mask_bytes), error, houtput); + if (diag_opts.verbose) + { + printf("done.\n Generating %u random numbers for convolution mask...", + mask_dim * mask_dim); + fflush(stdout); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_mask, + mask_dim * mask_dim, -1000, 1000); + + // Create device buffers, from which we will create the subbuffers for the + // subdevices. + const size_t grid_midpoint = y_dim / 2; + const size_t pad_grid_midpoint = pad_y_dim / 2; + + if (diag_opts.verbose) + { + printf("done.\nInitializing device-side storage..."); + fflush(stdout); + } + + OCLERROR_PAR(dev_input_grid = + clCreateBuffer(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR + | CL_MEM_HOST_NO_ACCESS, + input_bytes, h_input_grid, &error), + error, hmask); + OCLERROR_PAR(dev_output_grid = + clCreateBuffer(context, + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR + | CL_MEM_HOST_READ_ONLY, + output_bytes, NULL, &error), + error, bufin); + OCLERROR_PAR(dev_mask = + clCreateBuffer(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR + | CL_MEM_HOST_NO_ACCESS, + mask_bytes, h_mask, &error), + error, bufout); + + if (diag_opts.verbose) + { + printf("done.\nSetting up sub-devices..."); + fflush(stdout); + } + + // Set up subdevices for kernel execution. + const size_t half_input_bytes = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint + 1); + const size_t input_offset = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint - 1); + const size_t half_output_bytes = sizeof(cl_float) * x_dim * grid_midpoint; + + cl_uint subdevice = 0; + for (; subdevice < subdev_count; ++subdevice) + { + // Create kernel. + if (diag_opts.verbose) + { + printf("\n Creating kernel and command queue of sub-device %d...", + subdevice); + fflush(stdout); + } + + OCLERROR_PAR(convolutions[subdevice] = + clCreateKernel(program, "convolution_3x3", &error), + error, bufmask); + + // Initialize queues for command execution on each device. 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + cl_command_queue_properties props[] = { CL_QUEUE_PROPERTIES, + CL_QUEUE_PROFILING_ENABLE, 0 }; + OCLERROR_PAR(sub_queues[subdevice] = clCreateCommandQueueWithProperties( + context, subdevices[subdevice], props, &error), + error, conv); +#else + OCLERROR_PAR(sub_queues[subdevice] = clCreateCommandQueue( + context, subdevices[subdevice], + CL_QUEUE_PROFILING_ENABLE, &error), + error, conv); +#endif + + // Initialize device-side storage. + // First device performs the convolution in the upper half and second + // device in the lower half (middle borders included). + if (diag_opts.verbose) + { + printf("done.\n Initializing device-side storage of sub-device " + "%d...", + subdevice); + fflush(stdout); + } + + cl_buffer_region input_region = { subdevice * input_offset, + half_input_bytes }, + output_region = { subdevice * half_output_bytes, + half_output_bytes }; + OCLERROR_PAR(sub_input_grids[subdevice] = clCreateSubBuffer( + dev_input_grid, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &input_region, &error), + error, subqueue); + OCLERROR_PAR(sub_output_grids[subdevice] = clCreateSubBuffer( + dev_output_grid, CL_MEM_WRITE_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &output_region, &error), + error, subbufin); + + if (diag_opts.verbose) + { + printf("done."); + fflush(stdout); + } + + // Set kernels arguments. + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 0, sizeof(cl_mem), + &sub_input_grids[subdevice]), + error, subbufout); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 1, sizeof(cl_mem), + &sub_output_grids[subdevice]), + error, subbufout); + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 2, sizeof(cl_mem), + &dev_mask), + error, subbufout); + cl_uint2 output_dimensions; + output_dimensions.x = (cl_uint)x_dim; + output_dimensions.y = (cl_uint)grid_midpoint; + OCLERROR_RET(clSetKernelArg(convolutions[subdevice], 3, + sizeof(cl_uint2), &output_dimensions), + error, subbufout); + } + + // Launch kernels. 
+ if (diag_opts.verbose) + { + printf("\nExecuting on device... "); + fflush(stdout); + } + + // Enqueue kernel calls and wait for them to finish. + const size_t* global = (size_t[]){ x_dim, y_dim }; + + GET_CURRENT_TIMER(dev_start) + OCLERROR_RET(clEnqueueNDRangeKernel(sub_queues[0], convolutions[0], 2, NULL, + global, NULL, 0, NULL, &events[0]), + error, subbufout); + OCLERROR_RET(clEnqueueNDRangeKernel(sub_queues[1], convolutions[1], 2, NULL, + global, NULL, 0, NULL, &events[1]), + error, event1); + + OCLERROR_RET(clWaitForEvents(1, &events[0]), error, event1); + OCLERROR_RET(clWaitForEvents(1, &events[1]), error, event2); + GET_CURRENT_TIMER(dev_end) + size_t dev_time; + TIMER_DIFFERENCE(dev_time, dev_start, dev_end) + + // Compute reference host-side convolution. + if (diag_opts.verbose) + { + printf("done.\nExecuting on host... "); + fflush(stdout); + } + + GET_CURRENT_TIMER(host_start) + host_convolution(h_input_grid, h_output_grid, h_mask, (cl_uint)x_dim, (cl_uint)y_dim); + GET_CURRENT_TIMER(host_end) + size_t host_time; + TIMER_DIFFERENCE(host_time, host_start, host_end) + + if (diag_opts.verbose) + { + printf("done.\n"); + fflush(stdout); + } + + // Fetch and combine results from devices. + cl_float* concatenated_results; + const size_t mid_output_count = x_dim * grid_midpoint; + const size_t mid_output_bytes = sizeof(cl_float) * mid_output_count; + MEM_CHECK(concatenated_results = (cl_float*)malloc(output_bytes), error, + event2); + for (cl_uint i = 0; i < subdev_count; ++i) + { + OCLERROR_RET( + clEnqueueReadBuffer(sub_queues[i], sub_output_grids[i], CL_BLOCKING, + 0, mid_output_bytes, + &concatenated_results[i * mid_output_count], 0, + NULL, NULL), + error, result); + } + + // Validate device-side solution. 
+ cl_float deviation = 0.f; + const cl_float tolerance = 1e-6; + + for (size_t i = 0; i < x_dim * y_dim; ++i) + { + deviation += fabs(concatenated_results[i] - h_output_grid[i]); + } + deviation /= (x_dim * y_dim); + + if (deviation > tolerance) + { + printf("Failed convolution! Normalized deviation %.6f between host and " + "device exceeds tolerance %.6f\n", + deviation, tolerance); + fflush(stdout); + } + else + { + printf("Successful convolution!\n"); + fflush(stdout); + } + + if (!diag_opts.quiet) + { + printf("Kernels execution time as seen by host: %llu us.\n", + (unsigned long long)(dev_time + 500) / 1000); + + printf("Kernels execution time as measured by devices :\n"); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + events[0], CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + events[1], CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + + printf("Reference execution as seen by host: %llu us.\n", + (unsigned long long)(host_time + 500) / 1000); + fflush(stdout); + } + +result: + free(concatenated_results); +event2: + OCLERROR_RET(clReleaseEvent(events[1]), end_error, event1); +event1: + OCLERROR_RET(clReleaseEvent(events[0]), end_error, subbufout); +subbufout: + if (subdevice == 1) + { + OCLERROR_RET(clReleaseMemObject(sub_output_grids[1]), end_error, + subbufout0); + } +subbufout0: + OCLERROR_PAR(clReleaseMemObject(sub_output_grids[0]), end_error, subbufin); +subbufin: + if (subdevice == 1) + { + OCLERROR_RET(clReleaseMemObject(sub_input_grids[1]), end_error, + subbufin0); + } +subbufin0: + OCLERROR_RET(clReleaseMemObject(sub_input_grids[0]), end_error, subqueue); +subqueue: + if (subdevice == 1) + { + OCLERROR_RET(clReleaseCommandQueue(sub_queues[1]), end_error, + subqueue0); + } +subqueue0: + OCLERROR_RET(clReleaseCommandQueue(sub_queues[1]), end_error, conv); +conv: + if 
(subdevice == 1) + { + OCLERROR_RET(clReleaseKernel(convolutions[1]), end_error, conv0); + } +conv0: + OCLERROR_RET(clReleaseKernel(convolutions[0]), end_error, bufmask); +bufmask: + OCLERROR_RET(clReleaseMemObject(dev_mask), end_error, bufout); +bufout: + OCLERROR_RET(clReleaseMemObject(dev_output_grid), end_error, bufin); +bufin: + OCLERROR_RET(clReleaseMemObject(dev_input_grid), end_error, hmask); +hmask: + free(h_mask); +houtput: + free(h_output_grid); +hinput: + free(h_input_grid); +prg: + OCLERROR_RET(clReleaseProgram(program), end_error, subdevs); +ker: + free(kernel); +contx: + OCLERROR_RET(clReleaseContext(context), end_error, end); +subdevs: + free(subdevices); +dev: + OCLERROR_RET(clReleaseDevice(dev), end_error, end); +end: + if (error) cl_util_print_error(error); + return error; +} diff --git a/samples/core/multi-device/main.cpp b/samples/core/multi-device/main.cpp new file mode 100644 index 00000000..6a9cf399 --- /dev/null +++ b/samples/core/multi-device/main.cpp @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2023 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// OpenCL SDK includes. +#include +#include +#include +#include + +// OpenCL Utils includes. +#include +#include +#include + +// TCLAP includes. +#include + +// Standard header includes. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Sample-specific options. 
+struct ConvolutionOptions +{ + cl_uint x_dim; + cl_uint y_dim; +}; + +// Add option to CLI-parsing SDK utility for input dimensions. +template <> auto cl::sdk::parse() +{ + return std::make_tuple(std::make_shared>( + "x", "x_dim", "x dimension of input", false, + 4'096, "positive integral"), + std::make_shared>( + "y", "y_dim", "y dimension of input", false, + 4'096, "positive integral")); +} +template <> +ConvolutionOptions cl::sdk::comprehend( + std::shared_ptr> x_dim_arg, + std::shared_ptr> y_dim_arg) +{ + return ConvolutionOptions{ x_dim_arg->getValue(), y_dim_arg->getValue() }; +} + +// Host-side implementation of the convolution for verification. Padded input +// assumed. +void host_convolution(const std::vector in, + std::vector& out, + const std::vector mask, const cl_uint x_dim, + const cl_uint y_dim) +{ + constexpr cl_uint mask_dim = 3; + constexpr cl_uint pad_width = mask_dim / 2; + const cl_uint in_dim_x = x_dim + pad_width * 2; + + for (cl_uint gid_x = 0; gid_x < x_dim; ++gid_x) + { + for (cl_uint gid_y = 0; gid_y < y_dim; ++gid_y) + { + float result = 0.f; + for (cl_uint y = 0; y < mask_dim; ++y) + { + for (cl_uint x = 0; x < mask_dim; ++x) + { + result += mask[y * mask_dim + x] + * in[(gid_y + y) * in_dim_x + (gid_x + x)]; + } + } + out[gid_y * x_dim + gid_x] = result; + } + } +} + +int main(int argc, char* argv[]) +{ + try + { + // Parse command-line options. + auto opts = cl::sdk::parse_cli(argc, argv); + const auto& diag_opts = std::get<0>(opts); + const auto& dev_opts = std::get<1>(opts); + const auto& conv_opts = std::get<2>(opts); + + // Create runtime objects based on user preference or default. 
+ cl::Device dev = cl::sdk::get_context(dev_opts.triplet) + .getInfo() + .at(0); + + cl::Platform platform{ + dev.getInfo() + }; // https://github.com/KhronosGroup/OpenCL-CLHPP/issues/150 + + if (!diag_opts.quiet) + { + std::cout << "Selected device: " << dev.getInfo() + << "\n" + << "from " << platform.getInfo() + << " platform\n" + << std::endl; + } + + if (diag_opts.verbose) + { + std::cout << "Creating sub-devices..."; + std::cout.flush(); + } + +#if CL_HPP_TARGET_OPENCL_VERSION < 120 + std::cerr + << "Error: OpenCL subdevices not supported before version 1.2 " + << std::endl; + exit(EXIT_FAILURE); +#endif + + // Create subdevices, each with half of the compute units available. + cl_uint max_compute_units = dev.getInfo(); + cl_device_partition_property subdevices_properties[] = { + CL_DEVICE_PARTITION_EQUALLY, + static_cast(max_compute_units / 2), 0 + }; + std::vector subdevices{}; + dev.createSubDevices(subdevices_properties, &subdevices); + + if (subdevices.size() < 2) + { + std::cerr << "Error: OpenCL cannot create subdevices" << std::endl; + exit(EXIT_FAILURE); + } + + cl::Context context(subdevices); + + // Read kernel file. + const char* kernel_location = "./convolution.cl"; + std::ifstream kernel_stream{ kernel_location }; + if (!kernel_stream.is_open()) + throw std::runtime_error{ + std::string{ "Cannot open kernel source: " } + kernel_location + }; + + // Compile kernel. + if (diag_opts.verbose) + { + std::cout << "done.\nCompiling kernel..."; + std::cout.flush(); + } + cl::Program program( + context, + std::string{ std::istreambuf_iterator{ kernel_stream }, + std::istreambuf_iterator{} }); + + // Query device and runtime capabilities. + // If no -cl-std option is specified then the highest 1.x version + // supported by each device is used to compile the program. Therefore, + // it's only necessary to add the -cl-std option for 2.0 and 3.0 OpenCL + // versions. 
+ cl::string compiler_options; + constexpr int max_major_version = 3; + for (auto i = 2; i <= max_major_version; ++i) + { + std::string version_str = std::to_string(i) + "."; // "i." + std::string compiler_opt_str = + "-cl-std=CL" + std::to_string(i) + ".0 "; // -cl-std=CLi.0 + + compiler_options += cl::string{ cl::util::opencl_c_version_contains( + dev, version_str) + ? compiler_opt_str + : "" }; + } + program.build(subdevices, compiler_options.c_str()); + + // Initialize host-side storage. + constexpr cl_uint mask_dim = 3; + constexpr cl_uint pad_width = mask_dim / 2; + const cl_uint x_dim = conv_opts.x_dim; + const cl_uint y_dim = conv_opts.y_dim; + const cl_uint pad_x_dim = x_dim + 2 * pad_width; + const cl_uint pad_y_dim = y_dim + 2 * pad_width; + + const size_t input_size = pad_x_dim * pad_y_dim; + const size_t output_size = x_dim * y_dim; + const size_t mask_size = mask_dim * mask_dim; + const size_t input_bytes = sizeof(cl_float) * input_size; + const size_t output_bytes = sizeof(cl_float) * output_size; + const size_t mask_bytes = sizeof(cl_float) * mask_size; + + if (diag_opts.verbose) + { + std::cout << "done.\nInitializing host-side storage..."; + std::cout.flush(); + } + + // Random number generator. + auto prng = [engine = std::default_random_engine{}, + dist = std::uniform_real_distribution{ + -1.0, 1.0 }]() mutable { return dist(engine); }; + + // Initialize input matrix. The input will be padded to remove + // conditional branches from the convolution kernel for determining + // out-of-bounds. + std::vector h_input_grid(input_size); + if (diag_opts.verbose) + { + std::cout << "\n Generating " << output_size + << " random numbers for convolution input grid..."; + std::cout.flush(); + } + cl::sdk::fill_with_random(prng, h_input_grid); + + // Fill with 0s the extra rows and columns added for padding. 
+ for (cl_uint y = 0; y < pad_y_dim; ++y) + { + for (cl_uint x = 0; x < pad_x_dim; ++x) + { + if (x == 0 || y == 0 || x == (pad_x_dim - 1) + || y == (pad_y_dim - 1)) + { + h_input_grid[y * pad_x_dim + x] = 0; + } + } + } + + // Declare output matrix. Output will not be padded. + std::vector h_output_grid(output_size, 0); + + // Initialize convolution mask. + std::vector h_mask(mask_size); + if (diag_opts.verbose) + { + std::cout << "done. \nGenerating " << mask_size + << " random numbers for convolution mask..."; + std::cout.flush(); + } + cl::sdk::fill_with_random(prng, h_mask); + + // Create device buffers, from which we will create the subbuffers for + // the subdevices. + const cl_uint grid_midpoint = y_dim / 2; + const cl_uint pad_grid_midpoint = pad_y_dim / 2; + + if (diag_opts.verbose) + { + std::cout << "done.\nInitializing device-side storage..."; + std::cout.flush(); + } + + cl::Buffer dev_input_grid(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR + | CL_MEM_HOST_NO_ACCESS, + input_bytes, h_input_grid.data()); + cl::Buffer dev_output_grid(context, + CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR + | CL_MEM_HOST_READ_ONLY, + output_bytes); + cl::Buffer dev_mask(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR + | CL_MEM_HOST_NO_ACCESS, + mask_bytes, h_mask.data()); + + if (diag_opts.verbose) + { + std::cout << "done.\nSetting up sub-devices..."; + std::cout.flush(); + } + + // Set up subdevices for kernel execution. 
+ const size_t half_input_bytes = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint + 1); + const size_t input_offset = + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint - 1); + const size_t half_output_bytes = + sizeof(cl_float) * x_dim * grid_midpoint; + + std::vector< + cl::KernelFunctor> + convolutions{}; + std::vector sub_queues{}; + std::vector sub_input_grids{}, sub_output_grids{}; + + for (size_t i = 0; i < subdevices.size(); ++i) + { + auto subdevice = subdevices[i]; + + if (diag_opts.verbose) + { + std::cout + << "\n Creating kernel and command queue of sub-device " + << i << "..."; + std::cout.flush(); + } + + auto convolution = + cl::KernelFunctor( + program, "convolution_3x3") + .getKernel(); + + cl::CommandQueue queue(context, subdevice, + cl::QueueProperties::Profiling); + + // Initialize device-side storage. + // First device performs the convolution in the upper half and + // second device in the lower half (middle borders included). + if (diag_opts.verbose) + { + std::cout << "done.\n Initializing device-side storage of " + "sub-device " + << i << "..."; + std::cout.flush(); + } + + cl_buffer_region input_region = { i * input_offset, + half_input_bytes }, + output_region = { i * half_output_bytes, + half_output_bytes }; + + const cl_uint align = + subdevice.getInfo(); + if (input_region.origin % align || output_region.origin % align) + { + std::cerr << "Error: Memory should be aligned to " + << subdevice.getInfo() + << std::endl; + exit(EXIT_FAILURE); + } + + cl::Buffer sub_input_grid = dev_input_grid.createSubBuffer( + CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &input_region); + cl::Buffer sub_output_grid = dev_output_grid.createSubBuffer( + CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &output_region); + + if (diag_opts.verbose) + { + std::cout << "done."; + std::cout.flush(); + } + + convolutions.push_back(convolution.clone()); + sub_queues.push_back(queue); + sub_input_grids.push_back(sub_input_grid); + 
sub_output_grids.push_back(sub_output_grid); + } + + // Launch kernels. + if (diag_opts.verbose) + { + std::cout << "\nExecuting on device... "; + std::cout.flush(); + } + + // Initialize global and local buffers for device execution. + const cl::NDRange global{ x_dim, y_dim }; + + // Enqueue kernel calls and wait for them to finish. + std::vector dev1_kernel_runs; + dev1_kernel_runs.reserve(1); + std::vector dev2_kernel_runs; + dev2_kernel_runs.reserve(1); + auto dev_start = std::chrono::high_resolution_clock::now(); + + dev1_kernel_runs.push_back(convolutions[0]( + cl::EnqueueArgs{ sub_queues[0], global }, sub_input_grids[0], + sub_output_grids[0], dev_mask, { { x_dim, grid_midpoint } })); + + dev2_kernel_runs.push_back(convolutions[1]( + cl::EnqueueArgs{ sub_queues[1], global }, sub_input_grids[1], + sub_output_grids[1], dev_mask, { { x_dim, grid_midpoint } })); + + cl::WaitForEvents(dev1_kernel_runs); + cl::WaitForEvents(dev2_kernel_runs); + auto dev_end = std::chrono::high_resolution_clock::now(); + + // Compute reference host-side convolution. + if (diag_opts.verbose) + { + std::cout << " done.\nExecuting on host... "; + std::cout.flush(); + } + auto host_start = std::chrono::high_resolution_clock::now(); + + host_convolution(h_input_grid, h_output_grid, h_mask, x_dim, y_dim); + + auto host_end = std::chrono::high_resolution_clock::now(); + + if (diag_opts.verbose) + { + std::cout << "done." << std::endl; + } + + // Fetch and combine results from devices. + std::vector concatenated_results(output_size); + cl::copy(sub_queues.front(), dev_output_grid, + concatenated_results.begin(), concatenated_results.end()); + + // Validate device-side solution. + cl_float deviation = 0.f; + const cl_float tolerance = 1e-6; + + for (size_t i = 0; i < concatenated_results.size(); ++i) + { + deviation += std::fabs(concatenated_results[i] - h_output_grid[i]); + } + deviation /= concatenated_results.size(); + + if (deviation > tolerance) + { + std::cerr << "Failed convolution! 
Normalized deviation " + << deviation + << " between host and device exceeds tolerance " + << tolerance << std::endl; + } + else + { + std::cout << "Successful convolution!" << std::endl; + } + + if (!diag_opts.quiet) + { + std::cout << "Kernels execution time as seen by host: " + << std::chrono::duration_cast( + dev_end - dev_start) + .count() + << " us." << std::endl; + std::cout << "Kernels execution time as measured by devices: " + << std::endl; + for (auto& pass : dev1_kernel_runs) + std::cout << " - " + << cl::util::get_duration( + pass) + .count() + << " us." << std::endl; + for (auto& pass : dev2_kernel_runs) + std::cout << " - " + << cl::util::get_duration( + pass) + .count() + << " us." << std::endl; + std::cout << "Reference execution as seen by host: " + << std::chrono::duration_cast( + host_end - host_start) + .count() + << " us." << std::endl; + } + } catch (cl::BuildError& e) + { + std::cerr << "OpenCL build error: " << e.what() << std::endl; + for (auto& build_log : e.getBuildLog()) + { + std::cerr << "\tBuild log for device: " + << build_log.first.getInfo() << "\n" + << std::endl; + std::cerr << build_log.second << "\n" << std::endl; + } + std::exit(e.err()); + } catch (cl::util::Error& e) + { + std::cerr << "OpenCL utils error: " << e.what() << std::endl; + std::exit(e.err()); + } catch (cl::Error& e) + { + std::cerr << "OpenCL runtime error: " << e.what() << std::endl; + std::exit(e.err()); + } catch (std::exception& e) + { + std::cerr << "Error: " << e.what() << std::endl; + std::exit(EXIT_FAILURE); + } + + return 0; +}