From f322f723cdbf94ec26579c0fd597542b1cb9fe6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Fri, 28 Jun 2024 03:17:52 -0700 Subject: [PATCH] [DFT][Examples] Make the oneMKL DFT example consistent with the other domains (#518) --- examples/README.md | 3 +- .../compile_time_dispatching/CMakeLists.txt | 34 +++-- .../complex_fwd_buffer_mklcpu.cpp | 132 ------------------ ...u.cpp => complex_fwd_usm_mklcpu_cufft.cpp} | 94 ++++++++++--- .../dft/run_time_dispatching/CMakeLists.txt | 5 +- 5 files changed, 95 insertions(+), 173 deletions(-) delete mode 100644 examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp rename examples/dft/compile_time_dispatching/{complex_fwd_buffer_mklgpu.cpp => complex_fwd_usm_mklcpu_cufft.cpp} (53%) diff --git a/examples/README.md b/examples/README.md index 9904a78f2..0dad8772d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,7 +3,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin - blas: level3/gemm_usm - rng: uniform_usm - lapack: getrs_usm -- dft: complex_fwd_buffer, real_fwd_usm +- dft: complex_fwd_usm, real_fwd_usm - sparse_blas: sparse_gemv_usm Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively. @@ -11,7 +11,6 @@ Each routine has one run-time dispatching example and one compile-time dispatchi To build examples, use cmake build option `-DBUILD_EXAMPLES=true`. Compile_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and cuda backend is enabled, because the compile-time dispatching example runs on both mklcpu and cuda backends. Run_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and `-DBUILD_SHARED_LIBS=true`. -All DFT examples require the mklgpu backend to be enabled. The example executable naming convention follows `example_<$domain>_<$routine>_<$backend>` for compile-time dispatching examples or `example_<$domain>_<$routine>` for run-time dispatching examples. diff --git a/examples/dft/compile_time_dispatching/CMakeLists.txt b/examples/dft/compile_time_dispatching/CMakeLists.txt index 0cddd8f5f..704964af7 100644 --- a/examples/dft/compile_time_dispatching/CMakeLists.txt +++ b/examples/dft/compile_time_dispatching/CMakeLists.txt @@ -18,31 +18,35 @@ #=============================================================================== #Build object from all sources -set(DFTI_CT_BACKENDS "") - -if(ENABLE_MKLGPU_BACKEND) - list(APPEND DFTI_CT_BACKENDS "mklgpu") -endif() - -if(ENABLE_MKLCPU_BACKEND) - list(APPEND DFTI_CT_BACKENDS "mklcpu") +set(DFT_CT_SOURCES "") +if (ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND) + list(APPEND DFT_CT_SOURCES "complex_fwd_usm_mklcpu_cufft") endif() include(WarningsUtils) -foreach(dfti_backend ${DFTI_CT_BACKENDS}) - set(EXAMPLE_NAME example_dft_complex_fwd_buffer_${dfti_backend}) - add_executable(${EXAMPLE_NAME} complex_fwd_buffer_${dfti_backend}.cpp) +foreach(dft_ct_source ${DFT_CT_SOURCES}) + set(EXAMPLE_NAME example_${domain}_${dft_ct_source}) + add_executable(${EXAMPLE_NAME} ${dft_ct_source}.cpp) target_include_directories(${EXAMPLE_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/examples/include PUBLIC ${PROJECT_SOURCE_DIR}/include PUBLIC ${CMAKE_BINARY_DIR}/bin ) - add_dependencies(${EXAMPLE_NAME} onemkl_dft_${dfti_backend}) - target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_dft_${dfti_backend} onemkl_warnings) +if(domain STREQUAL "dft" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND) + add_dependencies(${EXAMPLE_NAME} onemkl_${domain}_mklcpu onemkl_${domain}_cufft) + list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu onemkl_${domain}_cufft) +endif() + +target_link_libraries(${EXAMPLE_NAME} PUBLIC + ${ONEMKL_LIBRARIES_${domain}} + ONEMKL::SYCL::SYCL + onemkl_warnings + ) # Register example as ctest - add_test(NAME dft/EXAMPLE/CT/complex_fwd_buffer_${dfti_backend} COMMAND ${EXAMPLE_NAME}) -endforeach(dfti_backend) + add_test(NAME dft/EXAMPLE/CT/${dft_ct_source} COMMAND ${EXAMPLE_NAME}) + +endforeach(dft_ct_source) diff --git a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp b/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp deleted file mode 100644 index cb6e85ffa..000000000 --- a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/******************************************************************************* -* Copyright 2023 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions -* and limitations under the License. -* -* -* SPDX-License-Identifier: Apache-2.0 -*******************************************************************************/ - -// STL includes -#include - -// oneMKL/SYCL includes -#if __has_include() -#include -#else -#include -#endif -#include "oneapi/mkl.hpp" - -void run_example(const sycl::device& cpu_device) { - constexpr int N = 10; - - // Catch asynchronous exceptions for cpu - auto cpu_error_handler = [&](sycl::exception_list exceptions) { - for (auto const& e : exceptions) { - try { - std::rethrow_exception(e); - } - catch (sycl::exception const& e) { - // Handle not dft related exceptions that happened during asynchronous call - std::cerr << "Caught asynchronous SYCL exception:" << std::endl; - std::cerr << "\t" << e.what() << std::endl; - } - } - std::exit(2); - }; - - sycl::queue cpu_queue(cpu_device, cpu_error_handler); - - std::vector> input_data(N); - std::vector> output_data(N); - - // enabling - // 1. create descriptors - oneapi::mkl::dft::descriptor - desc(N); - - // 2. variadic set_value - desc.set_value(oneapi::mkl::dft::config_param::PLACEMENT, - oneapi::mkl::dft::config_value::NOT_INPLACE); - desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, - static_cast(1)); - - // 3. commit_descriptor (compile_time MKLCPU) - desc.commit(oneapi::mkl::backend_selector{ cpu_queue }); - - // 4. compute_forward / compute_backward (MKLCPU) - { - sycl::buffer> input_buffer(input_data.data(), sycl::range<1>(N)); - sycl::buffer> output_buffer(output_data.data(), sycl::range<1>(N)); - oneapi::mkl::dft::compute_forward, - std::complex>(desc, input_buffer, output_buffer); - } -} - -// -// Description of example setup, apis used and supported floating point type precisions -// -void print_example_banner() { - std::cout << "\n" - "########################################################################\n" - "# Complex out-of-place forward transform for Buffer API's example:\n" - "#\n" - "# Using APIs:\n" - "# Compile-time dispatch API\n" - "# Buffer forward complex out-of-place\n" - "#\n" - "# Using double precision (double) data type\n" - "#\n" - "# For Intel CPU with Intel MKLCPU backend.\n" - "#\n" - "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n" - "# available devices\n" - "########################################################################\n" - << std::endl; -} - -// -// Main entry point for example. -// -int main() { - print_example_banner(); - - try { - sycl::device cpu_device((sycl::cpu_selector_v)); - std::cout << "Running DFT Complex forward out-of-place buffer example" << std::endl; - std::cout << "Using compile-time dispatch API with MKLCPU." << std::endl; - std::cout << "Running with double precision real data type on:" << std::endl; - std::cout << "\tCPU device :" << cpu_device.get_info() - << std::endl; - - run_example(cpu_device); - std::cout << "DFT Complex USM example ran OK on MKLCPU" << std::endl; - } - catch (sycl::exception const& e) { - // Handle not dft related exceptions that happened during synchronous call - std::cerr << "Caught synchronous SYCL exception:" << std::endl; - std::cerr << "\t" << e.what() << std::endl; - std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; - return 1; - } - catch (std::exception const& e) { - // Handle not SYCL related exceptions that happened during synchronous call - std::cerr << "Caught synchronous std::exception:" << std::endl; - std::cerr << "\t" << e.what() << std::endl; - return 1; - } - - return 0; -} diff --git a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp b/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp similarity index 53% rename from examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp rename to examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp index 4c243569b..59c810f3f 100644 --- a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp +++ b/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,10 +27,26 @@ #include #endif #include "oneapi/mkl.hpp" +#include -void run_example(const sycl::device& gpu_device) { +void run_example(const sycl::device& cpu_device, const sycl::device& gpu_device) { constexpr std::size_t N = 10; + // Catch asynchronous exceptions for cpu + auto cpu_error_handler = [&](sycl::exception_list exceptions) { + for (auto const& e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const& e) { + // Handle not dft related exceptions that happened during asynchronous call + std::cerr << "Caught asynchronous SYCL exception on CPU device during execution:" + << std::endl; + std::cerr << "\t" << e.what() << std::endl; + } + } + std::exit(2); + }; // Catch asynchronous exceptions for gpu auto gpu_error_handler = [&](sycl::exception_list exceptions) { for (auto const& e : exceptions) { @@ -39,17 +55,30 @@ void run_example(const sycl::device& gpu_device) { } catch (sycl::exception const& e) { // Handle not dft related exceptions that happened during asynchronous call - std::cerr << "Caught asynchronous SYCL exception:" << std::endl; + std::cerr << "Caught asynchronous SYCL exception on GPU device during execution:" + << std::endl; std::cerr << "\t" << e.what() << std::endl; } } std::exit(2); }; + // Preparation CPU device and GPU device + sycl::queue cpu_queue(cpu_device, cpu_error_handler); sycl::queue gpu_queue(gpu_device, gpu_error_handler); - std::vector> input_data(N); - std::vector> output_data(N); + // allocate on CPU device and GPU device + auto cpu_input_data = sycl::malloc_shared>(N, cpu_queue); + auto cpu_output_data = sycl::malloc_shared>(N, cpu_queue); + + auto gpu_input_data = sycl::malloc_shared>(N, gpu_queue); + auto gpu_output_data = sycl::malloc_shared>(N, gpu_queue); + + // Initialize input data + for (std::size_t i = 0; i < N; ++i) { + cpu_input_data[i] = { static_cast(i), static_cast(-i) }; + gpu_input_data[i] = { static_cast(i), static_cast(-i) }; + } // enabling // 1. create descriptors @@ -63,16 +92,27 @@ void run_example(const sycl::device& gpu_device) { desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, static_cast(1)); - // 3. commit_descriptor (compile_time MKLGPU) - desc.commit(oneapi::mkl::backend_selector{ gpu_queue }); + // 3a. commit_descriptor (compile_time MKLCPU) + desc.commit(oneapi::mkl::backend_selector{ cpu_queue }); - // 4. compute_forward / compute_backward (MKLGPU) - { - sycl::buffer> input_buffer(input_data.data(), sycl::range<1>(N)); - sycl::buffer> output_buffer(output_data.data(), sycl::range<1>(N)); - oneapi::mkl::dft::compute_forward, std::complex>( - desc, input_buffer, output_buffer); - } + // 4a. compute_forward / compute_backward (MKLCPU) + oneapi::mkl::dft::compute_forward, std::complex>( + desc, cpu_input_data, cpu_output_data); + + // 3b. commit_descriptor (compile_time cuFFT) + desc.commit(oneapi::mkl::backend_selector{ gpu_queue }); + + // 4b. compute_forward / compute_backward (cuFFT) + oneapi::mkl::dft::compute_forward, std::complex>( + desc, gpu_input_data, gpu_output_data); + + cpu_queue.wait_and_throw(); + gpu_queue.wait_and_throw(); + + sycl::free(cpu_input_data, cpu_queue); + sycl::free(gpu_input_data, gpu_queue); + sycl::free(cpu_output_data, cpu_queue); + sycl::free(gpu_output_data, gpu_queue); } // @@ -81,18 +121,16 @@ void run_example(const sycl::device& gpu_device) { void print_example_banner() { std::cout << "\n" "########################################################################\n" - "# Complex out-of-place forward transform for Buffer API's example:\n" + "# Complex out-of-place forward transform for USM API's example:\n" "#\n" "# Using APIs:\n" "# Compile-time dispatch API\n" - "# Buffer forward complex out-of-place\n" + "# USM forward complex out-of-place\n" "#\n" "# Using single precision (float) data type\n" "#\n" - "# For Intel GPU with Intel MKLGPU backend.\n" + "# Running on both Intel CPU and NVIDIA GPU devices.\n" "#\n" - "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n" - "# available devices\n" "########################################################################\n" << std::endl; } @@ -104,15 +142,25 @@ int main(int /*argc*/, char** /*argv*/) { print_example_banner(); try { + sycl::device cpu_device((sycl::cpu_selector_v)); sycl::device gpu_device((sycl::gpu_selector_v)); - std::cout << "Running DFT Complex forward out-of-place buffer example" << std::endl; - std::cout << "Using compile-time dispatch API with MKLGPU." << std::endl; + + unsigned int vendor_id = gpu_device.get_info(); + if (vendor_id != NVIDIA_ID) { + std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl; + return 1; + } + + std::cout << "Running DFT Complex forward out-of-place usm example" << std::endl; + std::cout << "Using compile-time dispatch API with MKLCPU and cuFFT." << std::endl; std::cout << "Running with single precision real data type on:" << std::endl; + std::cout << "\tCPU device: " << cpu_device.get_info() + << std::endl; std::cout << "\tGPU device :" << gpu_device.get_info() << std::endl; - run_example(gpu_device); - std::cout << "DFT Complex USM example ran OK on MKLGPU" << std::endl; + run_example(cpu_device, gpu_device); + std::cout << "DFT Complex USM example ran OK on MKLCPU and CUFFT" << std::endl; } catch (sycl::exception const& e) { // Handle not dft related exceptions that happened during synchronous call diff --git a/examples/dft/run_time_dispatching/CMakeLists.txt b/examples/dft/run_time_dispatching/CMakeLists.txt index 6d9a8dd24..e221c7950 100644 --- a/examples/dft/run_time_dispatching/CMakeLists.txt +++ b/examples/dft/run_time_dispatching/CMakeLists.txt @@ -27,13 +27,16 @@ set(DFT_RT_SOURCES "") # If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to # overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend set(DEVICE_FILTERS "") -if(ENABLE_MKLGPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND) +if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND) list(APPEND DFT_RT_SOURCES "real_fwd_usm") endif() if(ENABLE_MKLGPU_BACKEND) list(APPEND DEVICE_FILTERS "level_zero:gpu") endif() +if(ENABLE_MKLCPU_BACKEND) + list(APPEND DEVICE_FILTERS "opencl:cpu") +endif() if(ENABLE_PORTFFT_BACKEND) list(APPEND DEVICE_FILTERS "*:gpu") endif()