From 5b2f1d2f2e472e49fb9d24089cd0fe80f25aa4d2 Mon Sep 17 00:00:00 2001 From: Isaac Ault Date: Thu, 12 Sep 2024 14:48:51 +0100 Subject: [PATCH] [SYCL][Bindless] Image Array Sub-Region Copy (#14954) * Add support for sub-region copies of image arrays. * Initial implementation of tests. UR PR: https://github.com/oneapi-src/unified-runtime/pull/1928 --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 12 +- .../sycl_ext_oneapi_bindless_images.asciidoc | 14 +- sycl/source/handler.cpp | 35 +++-- .../array/read_write_1d_subregion.cpp | 145 ++++++++++++++++++ .../array/read_write_2d_subregion.cpp | 145 ++++++++++++++++++ 5 files changed, 332 insertions(+), 19 deletions(-) create mode 100644 sycl/test-e2e/bindless_images/array/read_write_1d_subregion.cpp create mode 100644 sycl/test-e2e/bindless_images/array/read_write_2d_subregion.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index fce554d30582d..7cbb5cb6101e8 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 24a8299efc59c715a1c2dd180692a5e12a12283a - # Merge: eb63d1a2 2fea679d + # commit 2bbe952669861579ea84fa30f14e1ed27ead0692 + # Merge: d357964a 6b353545 # Author: Omar Ahmed - # Date: Wed Sep 11 10:40:59 2024 +0100 - # Merge pull request #2078 from callumfare/callum/fix_device_extensions_fpga - # Add workaround for silently supported OpenCL extensions on Intel FPGA - set(UNIFIED_RUNTIME_TAG 24a8299efc59c715a1c2dd180692a5e12a12283a) + # Date: Thu Sep 12 11:36:11 2024 +0100 + # Merge pull request #1928 from isaacault/iault/image_array_copy + # [Bindless][Exp] Image Array Sub-Region Copies + set(UNIFIED_RUNTIME_TAG 2bbe952669861579ea84fa30f14e1ed27ead0692) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc index 4b0217877c46f..ba3eac3579490 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc @@ -744,7 +744,7 @@ address mode `clamp_to_edge` will be applied for all dimensions. If the performed when sampling along the cube face borders. ==== -=== Explicit copies +=== Explicit copies [[explicit_copies]] ```cpp namespace sycl { @@ -1398,9 +1398,14 @@ As with allocation, the descriptor must be populated appropriately, i.e. === Copying image array data [[copying_image_array_data]] -When copying to or from image arrays, the user should copy to/from the entire -array of images in one call to `ext_oneapi_copy` by passing the image arrays' -`image_mem_handle`. +When copying to or from image arrays, the user should utilize `ext_oneapi_copy` +and pass the image arrays' `image_mem_handle`, and any applicable sub-region +copy parameters, as outlined in <>. + +In order to copy to specific layers of an image array, the offset and extent +parameters involved in sub-region copies must be populated such that the 3rd +dimension of the ranges represent the arrays' layer(s) being copied, regardless +of whether the copy is performed on a 1D or 2D image array. === Reading an image array @@ -2888,4 +2893,5 @@ These features still need to be handled: `map_external_linear_memory`. |6 |2024-08-05 | - Collated all changes since revision 5. - Bumped SYCL_EXT_ONEAPI_BINDLESS_IMAGES to number 6. +|6.1|2024-09-09| - Update for image-array sub-region copy support. |====================== diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 50e7d007e537e..49e6ddd72e92d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "ur_api.h" #include "sycl/detail/helpers.hpp" +#include "ur_api.h" #include #include @@ -1046,10 +1046,15 @@ void handler::ext_oneapi_copy( Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP : UrDesc.type; + + // Array size is depth extent. + impl->MCopyExtent = {Desc.width, Desc.height, Desc.array_size}; } else { UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D : UR_MEM_TYPE_IMAGE1D); + + impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; } ur_image_format_t UrFormat; @@ -1061,7 +1066,6 @@ void handler::ext_oneapi_copy( impl->MSrcOffset = {0, 0, 0}; impl->MDestOffset = {0, 0, 0}; - impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; impl->MSrcImageDesc = UrDesc; impl->MDstImageDesc = UrDesc; impl->MSrcImageFormat = UrFormat; @@ -1136,7 +1140,7 @@ void handler::ext_oneapi_copy( sycl_ext_oneapi_bindless_images>(); Desc.verify(); - MSrcPtr = reinterpret_cast(Src.raw_handle); + MSrcPtr = reinterpret_cast(Src.raw_handle); MDstPtr = Dest; ur_image_desc_t UrDesc = {}; @@ -1156,10 +1160,15 @@ void handler::ext_oneapi_copy( Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP : UrDesc.type; + + // Array size is depth extent. + impl->MCopyExtent = {Desc.width, Desc.height, Desc.array_size}; } else { UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D : UR_MEM_TYPE_IMAGE1D); + + impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; } ur_image_format_t UrFormat; @@ -1171,7 +1180,6 @@ void handler::ext_oneapi_copy( impl->MSrcOffset = {0, 0, 0}; impl->MDestOffset = {0, 0, 0}; - impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; impl->MSrcImageDesc = UrDesc; impl->MDstImageDesc = UrDesc; impl->MSrcImageFormat = UrFormat; @@ -1189,8 +1197,8 @@ void handler::ext_oneapi_copy( sycl_ext_oneapi_bindless_images>(); ImageDesc.verify(); - MSrcPtr = reinterpret_cast(Src.raw_handle); - MDstPtr = reinterpret_cast(Dest.raw_handle); + MSrcPtr = reinterpret_cast(Src.raw_handle); + MDstPtr = reinterpret_cast(Dest.raw_handle); ur_image_desc_t UrDesc = {}; UrDesc.stype = UR_STRUCTURE_TYPE_IMAGE_DESC; @@ -1208,11 +1216,17 @@ void handler::ext_oneapi_copy( ImageDesc.type == sycl::ext::oneapi::experimental::image_type::cubemap ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP : UrDesc.type; + + // Array size is depth extent. + impl->MCopyExtent = {ImageDesc.width, ImageDesc.height, + ImageDesc.array_size}; } else { UrDesc.type = ImageDesc.depth > 0 ? UR_MEM_TYPE_IMAGE3D : (ImageDesc.height > 0 ? UR_MEM_TYPE_IMAGE2D : UR_MEM_TYPE_IMAGE1D); + + impl->MCopyExtent = {ImageDesc.width, ImageDesc.height, ImageDesc.depth}; } ur_image_format_t UrFormat; @@ -1224,7 +1238,6 @@ void handler::ext_oneapi_copy( impl->MSrcOffset = {0, 0, 0}; impl->MDestOffset = {0, 0, 0}; - impl->MCopyExtent = {ImageDesc.width, ImageDesc.height, ImageDesc.depth}; impl->MSrcImageDesc = UrDesc; impl->MDstImageDesc = UrDesc; impl->MSrcImageFormat = UrFormat; @@ -1244,7 +1257,7 @@ void handler::ext_oneapi_copy( sycl_ext_oneapi_bindless_images>(); SrcImgDesc.verify(); - MSrcPtr = reinterpret_cast(Src.raw_handle); + MSrcPtr = reinterpret_cast(Src.raw_handle); MDstPtr = Dest; ur_image_desc_t UrDesc = {}; @@ -1320,10 +1333,15 @@ void handler::ext_oneapi_copy( Desc.type == sycl::ext::oneapi::experimental::image_type::cubemap ? UR_MEM_TYPE_IMAGE_CUBEMAP_EXP : UrDesc.type; + + // Array size is depth extent. + impl->MCopyExtent = {Desc.width, Desc.height, Desc.array_size}; } else { UrDesc.type = Desc.depth > 0 ? UR_MEM_TYPE_IMAGE3D : (Desc.height > 0 ? UR_MEM_TYPE_IMAGE2D : UR_MEM_TYPE_IMAGE1D); + + impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; } ur_image_format_t UrFormat; @@ -1335,7 +1353,6 @@ void handler::ext_oneapi_copy( impl->MSrcOffset = {0, 0, 0}; impl->MDestOffset = {0, 0, 0}; - impl->MCopyExtent = {Desc.width, Desc.height, Desc.depth}; impl->MSrcImageDesc = UrDesc; impl->MDstImageDesc = UrDesc; impl->MSrcImageFormat = UrFormat; diff --git a/sycl/test-e2e/bindless_images/array/read_write_1d_subregion.cpp b/sycl/test-e2e/bindless_images/array/read_write_1d_subregion.cpp new file mode 100644 index 0000000000000..4815661efc2d2 --- /dev/null +++ b/sycl/test-e2e/bindless_images/array/read_write_1d_subregion.cpp @@ -0,0 +1,145 @@ +// REQUIRES: cuda + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include + +// Uncomment to print additional test information +// #define VERBOSE_PRINT + +class image_addition; + +int main() { + + sycl::device dev; + sycl::queue q(dev); + auto ctxt = q.get_context(); + + // declare image data + size_t width = 4; + size_t layers = 2; + size_t N = width * layers; + std::vector out(N); + std::vector expected(N); + std::vector dataIn1(N); + std::vector dataIn2(N); + for (int i = 0; i < width; i++) { + for (int j = 0; j < layers; j++) { + expected[j + ((layers)*i)] = (j + (layers)*i) * 3; + dataIn1[j + ((layers)*i)] = (j + (layers)*i); + dataIn2[j + ((layers)*i)] = (j + (layers)*i) * 2; + } + } + + // Image descriptor - can use the same for both images + sycl::ext::oneapi::experimental::image_descriptor desc( + {width}, 1, sycl::image_channel_type::fp32, + sycl::ext::oneapi::experimental::image_type::array, 1, layers); + + try { + // Extension: allocate memory on device and create the handle + sycl::ext::oneapi::experimental::image_mem imgMem0(desc, q); + sycl::ext::oneapi::experimental::image_mem imgMem1(desc, q); + sycl::ext::oneapi::experimental::image_mem imgMem2(desc, q); + + // Extension: create the image and return the handle + sycl::ext::oneapi::experimental::unsampled_image_handle imgHandle1 = + sycl::ext::oneapi::experimental::create_image(imgMem0, desc, q); + sycl::ext::oneapi::experimental::unsampled_image_handle imgHandle2 = + sycl::ext::oneapi::experimental::create_image(imgMem1, desc, q); + sycl::ext::oneapi::experimental::unsampled_image_handle imgHandle3 = + sycl::ext::oneapi::experimental::create_image(imgMem2, desc, q); + + // The subregion size for the copies. + sycl::range copyExtent = {width / 2, 1, layers / 2}; + // The extent of data provided on the host (vector). + sycl::range srcExtent = {width, 1, layers}; + + // the 4 subregion offsets used for the copies. + std::vector> offsets{{0, 0, 0}, + {width / 2, 0, 0}, + {0, 0, layers / 2}, + {width / 2, 0, layers / 2}}; + + for (auto offset : offsets) { + // Extension: Copy to image array subregion. + q.ext_oneapi_copy(dataIn1.data(), offset, srcExtent, imgMem0.get_handle(), + offset, desc, copyExtent); + // Extension: Copy to image array subregion. + q.ext_oneapi_copy(dataIn2.data(), offset, srcExtent, imgMem1.get_handle(), + offset, desc, copyExtent); + } + q.wait_and_throw(); + + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<2>{{width, layers}, {width, layers}}, + [=](sycl::nd_item<2> it) { + size_t dim0 = it.get_local_id(0); + size_t dim1 = it.get_local_id(1); + float sum = 0; + // Extension: fetch image data from handle + float px1 = + sycl::ext::oneapi::experimental::fetch_image_array( + imgHandle1, int(dim0), dim1); + float px2 = + sycl::ext::oneapi::experimental::fetch_image_array( + imgHandle2, int(dim0), dim1); + + sum = px1 + px2; + + // Extension: write to image with handle + sycl::ext::oneapi::experimental::write_image_array( + imgHandle3, int(dim0), dim1, sum); + }); + }); + q.wait_and_throw(); + + // Extension: copy data from device to host (four subregions/quadrants) + for (auto offset : offsets) { + q.ext_oneapi_copy(imgMem2.get_handle(), offset, desc, out.data(), offset, + srcExtent, copyExtent); + } + q.wait_and_throw(); + + // Extension: cleanup + sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle1, q); + sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle2, q); + } catch (sycl::exception e) { + std::cerr << "SYCL exception caught! : " << e.what() << "\n"; + return 1; + } catch (...) { + std::cerr << "Unknown exception caught!\n"; + return 2; + } + + // collect and validate output + bool validated = true; + for (int i = 0; i < N; i++) { + bool mismatch = false; + if (out[i] != expected[i]) { + mismatch = true; + validated = false; + } + + if (mismatch) { +#ifdef VERBOSE_PRINT + std::cout << "Result mismatch! Expected: " << expected[i] + << ", Actual: " << out[i] << std::endl; +#else + break; +#endif + } + } + if (validated) { + std::cout << "Test passed!" << std::endl; + return 0; + } + + std::cout << "Test failed!" << std::endl; + return 3; +} diff --git a/sycl/test-e2e/bindless_images/array/read_write_2d_subregion.cpp b/sycl/test-e2e/bindless_images/array/read_write_2d_subregion.cpp new file mode 100644 index 0000000000000..1b72a57bed47c --- /dev/null +++ b/sycl/test-e2e/bindless_images/array/read_write_2d_subregion.cpp @@ -0,0 +1,145 @@ +// REQUIRES: cuda + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include + +// Uncomment to print additional test information +// #define VERBOSE_PRINT + +class image_addition; + +int main() { + + sycl::device dev; + sycl::queue q(dev); + auto ctxt = q.get_context(); + + // declare image data + size_t width = 6; + size_t height = 4; + size_t layers = 2; + size_t N = width * height * layers; + std::vector out(N); + std::vector expected(N); + std::vector dataIn(N); + // ROW-MAJOR + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + for (int k = 0; k < layers; k++) { + expected[k + (layers) * (j + (height)*i)] = + (k + (layers) * (j + (height)*i)) * 2; + dataIn[k + (layers) * (j + (height)*i)] = + k + (layers) * (j + (height)*i); + } + } + } + + try { + + // Extension: image descriptor - can use the same for both images + sycl::ext::oneapi::experimental::image_descriptor desc( + {width, height}, 1, sycl::image_channel_type::fp32, + sycl::ext::oneapi::experimental::image_type::array, 1, layers); + + // Extension: allocate memory on device and create the handle + sycl::ext::oneapi::experimental::image_mem imgMem0(desc, q); + sycl::ext::oneapi::experimental::image_mem imgMem1(desc, q); + + // The subregion size for the copies. + sycl::range copyExtent = {width / 2, height / 2, layers / 2}; + // The extent of data provided on the host (vector). + sycl::range srcExtent = {width, height, layers}; + + // the 4 subregion offsets used for the copies. + std::vector> offsets{{0, 0, 0}, + {width / 2, 0, 0}, + {0, height / 2, 0}, + {0, 0, layers / 2}, + {width / 2, height / 2, 0}, + {width / 2, 0, layers / 2}, + {0, height / 2, layers / 2}, + {width / 2, height / 2, layers / 2}}; + + for (auto offset : offsets) { + // Extension: Copy to image array subregion. + q.ext_oneapi_copy(dataIn.data(), offset, srcExtent, imgMem0.get_handle(), + offset, desc, copyExtent); + } + q.wait_and_throw(); + + // Extension: create the image and return the handle + sycl::ext::oneapi::experimental::unsampled_image_handle imgHandle1 = + sycl::ext::oneapi::experimental::create_image(imgMem0, desc, q); + sycl::ext::oneapi::experimental::unsampled_image_handle imgHandle2 = + sycl::ext::oneapi::experimental::create_image(imgMem1, desc, q); + + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>{{width, height, layers}, {width, height, layers}}, + [=](sycl::nd_item<3> it) { + size_t dim0 = it.get_global_id(0); + size_t dim1 = it.get_global_id(1); + size_t dim2 = it.get_global_id(2); + float sum = 0; + // Extension: fetch image data from handle + float px1 = + sycl::ext::oneapi::experimental::fetch_image_array( + imgHandle1, sycl::int2(dim0, dim1), dim2); + + // Extension: write to image with handle + sum = px1 + px1; + sycl::ext::oneapi::experimental::write_image_array( + imgHandle2, sycl::int2(dim0, dim1), dim2, sum); + }); + }); + q.wait_and_throw(); + + // Extension: copy data from device to host (four subregions/quadrants) + for (auto offset : offsets) { + q.ext_oneapi_copy(imgMem1.get_handle(), offset, desc, out.data(), offset, + srcExtent, copyExtent); + } + q.wait_and_throw(); + + // Extension: cleanup + sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle1, q); + } catch (sycl::exception e) { + std::cerr << "SYCL exception caught! : " << e.what() << "\n"; + return 1; + } catch (...) { + std::cerr << "Unknown exception caught!\n"; + return 2; + } + + // collect and validate output + bool validated = true; + for (int i = 0; i < N; i++) { + bool mismatch = false; + if (out[i] != expected[i]) { + mismatch = true; + validated = false; + } + + if (mismatch) { +#ifdef VERBOSE_PRINT + std::cout << "Result mismatch! Expected: " << expected[i] + << ", Actual: " << out[i] << std::endl; +#else + break; +#endif + } + } + + if (validated) { + std::cout << "Test passed!" << std::endl; + return 0; + } + + std::cout << "Test failed!" << std::endl; + return 3; +}