From f322f723cdbf94ec26579c0fd597542b1cb9fe6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= <nicolo.scipione@codeplay.com>
Date: Fri, 28 Jun 2024 03:17:52 -0700
Subject: [PATCH] [DFT][Examples] Make the oneMKL DFT example consistent with
 the other domains (#518)

---
 examples/README.md                            |   3 +-
 .../compile_time_dispatching/CMakeLists.txt   |  34 +++--
 .../complex_fwd_buffer_mklcpu.cpp             | 132 ------------------
 ...u.cpp => complex_fwd_usm_mklcpu_cufft.cpp} |  94 ++++++++++---
 .../dft/run_time_dispatching/CMakeLists.txt   |   5 +-
 5 files changed, 95 insertions(+), 173 deletions(-)
 delete mode 100644 examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp
 rename examples/dft/compile_time_dispatching/{complex_fwd_buffer_mklgpu.cpp => complex_fwd_usm_mklcpu_cufft.cpp} (53%)

diff --git a/examples/README.md b/examples/README.md
index 9904a78f2..0dad8772d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,7 +3,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin
 - blas: level3/gemm_usm  
 - rng: uniform_usm  
 - lapack: getrs_usm
-- dft: complex_fwd_buffer, real_fwd_usm
+- dft: complex_fwd_usm, real_fwd_usm
 - sparse_blas: sparse_gemv_usm
 
 Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively.
@@ -11,7 +11,6 @@ Each routine has one run-time dispatching example and one compile-time dispatchi
 To build examples, use cmake build option `-DBUILD_EXAMPLES=true`.  
 Compile_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and cuda backend is enabled, because the compile-time dispatching example runs on both mklcpu and cuda backends.
 Run_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and `-DBUILD_SHARED_LIBS=true`.
-All DFT examples require the mklgpu backend to be enabled.
 
 The example executable naming convention follows `example_<$domain>_<$routine>_<$backend>` for compile-time dispatching examples 
   or `example_<$domain>_<$routine>` for run-time dispatching examples. 
diff --git a/examples/dft/compile_time_dispatching/CMakeLists.txt b/examples/dft/compile_time_dispatching/CMakeLists.txt
index 0cddd8f5f..704964af7 100644
--- a/examples/dft/compile_time_dispatching/CMakeLists.txt
+++ b/examples/dft/compile_time_dispatching/CMakeLists.txt
@@ -18,31 +18,35 @@
 #===============================================================================
 
 #Build object from all sources
-set(DFTI_CT_BACKENDS "")
-
-if(ENABLE_MKLGPU_BACKEND)
-  list(APPEND DFTI_CT_BACKENDS "mklgpu")
-endif()
-
-if(ENABLE_MKLCPU_BACKEND)
-  list(APPEND DFTI_CT_BACKENDS "mklcpu")
+set(DFT_CT_SOURCES "")
+if (ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND)
+  list(APPEND DFT_CT_SOURCES "complex_fwd_usm_mklcpu_cufft")
 endif()
 
 include(WarningsUtils)
 
-foreach(dfti_backend ${DFTI_CT_BACKENDS})
-  set(EXAMPLE_NAME example_dft_complex_fwd_buffer_${dfti_backend})
-  add_executable(${EXAMPLE_NAME} complex_fwd_buffer_${dfti_backend}.cpp)
+foreach(dft_ct_source ${DFT_CT_SOURCES})
+  set(EXAMPLE_NAME example_${domain}_${dft_ct_source})
+  add_executable(${EXAMPLE_NAME} ${dft_ct_source}.cpp)
   target_include_directories(${EXAMPLE_NAME}
       PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
       PUBLIC ${PROJECT_SOURCE_DIR}/include
       PUBLIC ${CMAKE_BINARY_DIR}/bin
   )
 
-  add_dependencies(${EXAMPLE_NAME} onemkl_dft_${dfti_backend})
-  target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_dft_${dfti_backend} onemkl_warnings)
+  # DFT_CT_SOURCES is only populated when both MKLCPU and cuFFT are enabled, so no extra guard here.
+  # Use a local list instead of appending to ONEMKL_LIBRARIES_${domain} on every loop iteration.
+  set(dft_ct_backend_libs onemkl_${domain}_mklcpu onemkl_${domain}_cufft)
+  add_dependencies(${EXAMPLE_NAME} ${dft_ct_backend_libs})
+
+  target_link_libraries(${EXAMPLE_NAME} PRIVATE
+    ${ONEMKL_LIBRARIES_${domain}} ${dft_ct_backend_libs}
+    ONEMKL::SYCL::SYCL
+    onemkl_warnings
+  )
 
   # Register example as ctest
-  add_test(NAME dft/EXAMPLE/CT/complex_fwd_buffer_${dfti_backend} COMMAND ${EXAMPLE_NAME})
-endforeach(dfti_backend)
+  add_test(NAME dft/EXAMPLE/CT/${dft_ct_source} COMMAND ${EXAMPLE_NAME})
+
+endforeach(dft_ct_source)
 
diff --git a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp b/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp
deleted file mode 100644
index cb6e85ffa..000000000
--- a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*******************************************************************************
-* Copyright 2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions
-* and limitations under the License.
-*
-*
-* SPDX-License-Identifier: Apache-2.0
-*******************************************************************************/
-
-// STL includes
-#include <iostream>
-
-// oneMKL/SYCL includes
-#if __has_include(<sycl/sycl.hpp>)
-#include <sycl/sycl.hpp>
-#else
-#include <CL/sycl.hpp>
-#endif
-#include "oneapi/mkl.hpp"
-
-void run_example(const sycl::device& cpu_device) {
-    constexpr int N = 10;
-
-    // Catch asynchronous exceptions for cpu
-    auto cpu_error_handler = [&](sycl::exception_list exceptions) {
-        for (auto const& e : exceptions) {
-            try {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const& e) {
-                // Handle not dft related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl;
-                std::cerr << "\t" << e.what() << std::endl;
-            }
-        }
-        std::exit(2);
-    };
-
-    sycl::queue cpu_queue(cpu_device, cpu_error_handler);
-
-    std::vector<std::complex<double>> input_data(N);
-    std::vector<std::complex<double>> output_data(N);
-
-    // enabling
-    // 1. create descriptors
-    oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE,
-                                 oneapi::mkl::dft::domain::COMPLEX>
-        desc(N);
-
-    // 2. variadic set_value
-    desc.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                   oneapi::mkl::dft::config_value::NOT_INPLACE);
-    desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                   static_cast<std::int64_t>(1));
-
-    // 3. commit_descriptor (compile_time MKLCPU)
-    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue });
-
-    // 4. compute_forward / compute_backward (MKLCPU)
-    {
-        sycl::buffer<std::complex<double>> input_buffer(input_data.data(), sycl::range<1>(N));
-        sycl::buffer<std::complex<double>> output_buffer(output_data.data(), sycl::range<1>(N));
-        oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<double>,
-                                          std::complex<double>>(desc, input_buffer, output_buffer);
-    }
-}
-
-//
-// Description of example setup, apis used and supported floating point type precisions
-//
-void print_example_banner() {
-    std::cout << "\n"
-                 "########################################################################\n"
-                 "# Complex out-of-place forward transform for Buffer API's example:\n"
-                 "#\n"
-                 "# Using APIs:\n"
-                 "#   Compile-time dispatch API\n"
-                 "#   Buffer forward complex out-of-place\n"
-                 "#\n"
-                 "# Using double precision (double) data type\n"
-                 "#\n"
-                 "# For Intel CPU with Intel MKLCPU backend.\n"
-                 "#\n"
-                 "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n"
-                 "# available devices\n"
-                 "########################################################################\n"
-              << std::endl;
-}
-
-//
-// Main entry point for example.
-//
-int main() {
-    print_example_banner();
-
-    try {
-        sycl::device cpu_device((sycl::cpu_selector_v));
-        std::cout << "Running DFT Complex forward out-of-place buffer example" << std::endl;
-        std::cout << "Using compile-time dispatch API with MKLCPU." << std::endl;
-        std::cout << "Running with double precision real data type on:" << std::endl;
-        std::cout << "\tCPU device :" << cpu_device.get_info<sycl::info::device::name>()
-                  << std::endl;
-
-        run_example(cpu_device);
-        std::cout << "DFT Complex USM example ran OK on MKLCPU" << std::endl;
-    }
-    catch (sycl::exception const& e) {
-        // Handle not dft related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous SYCL exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
-        return 1;
-    }
-    catch (std::exception const& e) {
-        // Handle not SYCL related exceptions that happened during synchronous call
-        std::cerr << "Caught synchronous std::exception:" << std::endl;
-        std::cerr << "\t" << e.what() << std::endl;
-        return 1;
-    }
-
-    return 0;
-}
diff --git a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp b/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
similarity index 53%
rename from examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp
rename to examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
index 4c243569b..59c810f3f 100644
--- a/examples/dft/compile_time_dispatching/complex_fwd_buffer_mklgpu.cpp
+++ b/examples/dft/compile_time_dispatching/complex_fwd_usm_mklcpu_cufft.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Intel Corporation
+* Copyright 2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,10 +27,26 @@
 #include <CL/sycl.hpp>
 #endif
 #include "oneapi/mkl.hpp"
+#include <complex>
 
-void run_example(const sycl::device& gpu_device) {
+void run_example(const sycl::device& cpu_device, const sycl::device& gpu_device) {
     constexpr std::size_t N = 10;
 
+    // Catch asynchronous exceptions for cpu
+    auto cpu_error_handler = [&](sycl::exception_list exceptions) {
+        for (auto const& e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const& e) {
+                // Handle not dft related exceptions that happened during asynchronous call
+                std::cerr << "Caught asynchronous SYCL exception on CPU device during execution:"
+                          << std::endl;
+                std::cerr << "\t" << e.what() << std::endl;
+            }
+        }
+        std::exit(2);
+    };
     // Catch asynchronous exceptions for gpu
     auto gpu_error_handler = [&](sycl::exception_list exceptions) {
         for (auto const& e : exceptions) {
@@ -39,17 +55,30 @@ void run_example(const sycl::device& gpu_device) {
             }
             catch (sycl::exception const& e) {
                 // Handle not dft related exceptions that happened during asynchronous call
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl;
+                std::cerr << "Caught asynchronous SYCL exception on GPU device during execution:"
+                          << std::endl;
                 std::cerr << "\t" << e.what() << std::endl;
             }
         }
         std::exit(2);
     };
 
+    // Preparation CPU device and GPU device
+    sycl::queue cpu_queue(cpu_device, cpu_error_handler);
     sycl::queue gpu_queue(gpu_device, gpu_error_handler);
 
-    std::vector<std::complex<float>> input_data(N);
-    std::vector<std::complex<float>> output_data(N);
+    // allocate on CPU device and GPU device
+    auto cpu_input_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);
+    auto cpu_output_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);
+
+    auto gpu_input_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);
+    auto gpu_output_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);
+
+    // Initialize input data as (i, -i); negate after the cast because -i wraps for unsigned i
+    for (std::size_t i = 0; i < N; ++i) {
+        cpu_input_data[i] = { static_cast<float>(i), -static_cast<float>(i) };
+        gpu_input_data[i] = { static_cast<float>(i), -static_cast<float>(i) };
+    }
 
     // enabling
     // 1. create descriptors
@@ -63,16 +92,27 @@ void run_example(const sycl::device& gpu_device) {
     desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
                    static_cast<std::int64_t>(1));
 
-    // 3. commit_descriptor (compile_time MKLGPU)
-    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklgpu>{ gpu_queue });
+    // 3a. commit_descriptor (compile_time MKLCPU)
+    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue });
 
-    // 4. compute_forward / compute_backward (MKLGPU)
-    {
-        sycl::buffer<std::complex<float>> input_buffer(input_data.data(), sycl::range<1>(N));
-        sycl::buffer<std::complex<float>> output_buffer(output_data.data(), sycl::range<1>(N));
-        oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
-            desc, input_buffer, output_buffer);
-    }
+    // 4a. compute_forward / compute_backward (MKLCPU)
+    oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
+        desc, cpu_input_data, cpu_output_data);
+
+    // 3b. commit_descriptor (compile_time cuFFT)
+    desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::cufft>{ gpu_queue });
+
+    // 4b. compute_forward / compute_backward (cuFFT)
+    oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
+        desc, gpu_input_data, gpu_output_data);
+
+    cpu_queue.wait_and_throw();
+    gpu_queue.wait_and_throw();
+
+    sycl::free(cpu_input_data, cpu_queue);
+    sycl::free(gpu_input_data, gpu_queue);
+    sycl::free(cpu_output_data, cpu_queue);
+    sycl::free(gpu_output_data, gpu_queue);
 }
 
 //
@@ -81,18 +121,16 @@ void run_example(const sycl::device& gpu_device) {
 void print_example_banner() {
     std::cout << "\n"
                  "########################################################################\n"
-                 "# Complex out-of-place forward transform for Buffer API's example:\n"
+                 "# Complex out-of-place forward transform for USM API's example:\n"
                  "#\n"
                  "# Using APIs:\n"
                  "#   Compile-time dispatch API\n"
-                 "#   Buffer forward complex out-of-place\n"
+                 "#   USM forward complex out-of-place\n"
                  "#\n"
                  "# Using single precision (float) data type\n"
                  "#\n"
-                 "# For Intel GPU with Intel MKLGPU backend.\n"
+                 "# Running on both Intel CPU and NVIDIA GPU devices.\n"
                  "#\n"
-                 "# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n"
-                 "# available devices\n"
                  "########################################################################\n"
               << std::endl;
 }
@@ -104,15 +142,25 @@ int main(int /*argc*/, char** /*argv*/) {
     print_example_banner();
 
     try {
+        sycl::device cpu_device((sycl::cpu_selector_v));
         sycl::device gpu_device((sycl::gpu_selector_v));
-        std::cout << "Running DFT Complex forward out-of-place buffer example" << std::endl;
-        std::cout << "Using compile-time dispatch API with MKLGPU." << std::endl;
+
+        unsigned int vendor_id = gpu_device.get_info<sycl::info::device::vendor_id>();
+        if (vendor_id != NVIDIA_ID) {
+            std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
+            return 1;
+        }
+
+        std::cout << "Running DFT Complex forward out-of-place usm example" << std::endl;
+        std::cout << "Using compile-time dispatch API with MKLCPU and cuFFT." << std::endl;
         std::cout << "Running with single precision real data type on:" << std::endl;
+        std::cout << "\tCPU device: " << cpu_device.get_info<sycl::info::device::name>()
+                  << std::endl;
         std::cout << "\tGPU device :" << gpu_device.get_info<sycl::info::device::name>()
                   << std::endl;
 
-        run_example(gpu_device);
-        std::cout << "DFT Complex USM example ran OK on MKLGPU" << std::endl;
+        run_example(cpu_device, gpu_device);
+        std::cout << "DFT Complex USM example ran OK on MKLCPU and CUFFT" << std::endl;
     }
     catch (sycl::exception const& e) {
         // Handle not dft related exceptions that happened during synchronous call
diff --git a/examples/dft/run_time_dispatching/CMakeLists.txt b/examples/dft/run_time_dispatching/CMakeLists.txt
index 6d9a8dd24..e221c7950 100644
--- a/examples/dft/run_time_dispatching/CMakeLists.txt
+++ b/examples/dft/run_time_dispatching/CMakeLists.txt
@@ -27,13 +27,16 @@ set(DFT_RT_SOURCES "")
 # If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
 # overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
 set(DEVICE_FILTERS "")
-if(ENABLE_MKLGPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND)
+if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND)
   list(APPEND DFT_RT_SOURCES "real_fwd_usm")
 endif()
 
 if(ENABLE_MKLGPU_BACKEND)
   list(APPEND DEVICE_FILTERS "level_zero:gpu")
 endif()
+if(ENABLE_MKLCPU_BACKEND)
+  list(APPEND DEVICE_FILTERS "opencl:cpu")
+endif()
 if(ENABLE_PORTFFT_BACKEND)
   list(APPEND DEVICE_FILTERS "*:gpu")
 endif()