Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DFT][Examples] Make the oneMKL DFT example consistent with the other domains #518

Merged
merged 9 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin
- blas: level3/gemm_usm
- rng: uniform_usm
- lapack: getrs_usm
- dft: complex_fwd_buffer, real_fwd_usm
- dft: complex_fwd_usm, real_fwd_usm
mkrainiuk marked this conversation as resolved.
Show resolved Hide resolved
- sparse_blas: sparse_gemv_usm

Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively.

To build examples, use cmake build option `-DBUILD_EXAMPLES=true`.
Compile_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and cuda backend is enabled, because the compile-time dispatching example runs on both mklcpu and cuda backends.
Run_time_dispatching will be built if `-DBUILD_EXAMPLES=true` and `-DBUILD_SHARED_LIBS=true`.
All DFT examples require the mklgpu backend to be enabled.

The example executable naming convention follows `example_<$domain>_<$routine>_<$backend>` for compile-time dispatching examples
or `example_<$domain>_<$routine>` for run-time dispatching examples.
Expand Down
34 changes: 19 additions & 15 deletions examples/dft/compile_time_dispatching/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,35 @@
#===============================================================================

#Build object from all sources
set(DFTI_CT_BACKENDS "")

if(ENABLE_MKLGPU_BACKEND)
list(APPEND DFTI_CT_BACKENDS "mklgpu")
endif()

if(ENABLE_MKLCPU_BACKEND)
list(APPEND DFTI_CT_BACKENDS "mklcpu")
set(DFT_CT_SOURCES "")
if (ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND)
list(APPEND DFT_CT_SOURCES "complex_fwd_usm_mklcpu_cufft")
endif()

include(WarningsUtils)
Rbiessy marked this conversation as resolved.
Show resolved Hide resolved

foreach(dfti_backend ${DFTI_CT_BACKENDS})
set(EXAMPLE_NAME example_dft_complex_fwd_buffer_${dfti_backend})
add_executable(${EXAMPLE_NAME} complex_fwd_buffer_${dfti_backend}.cpp)
foreach(dft_ct_source ${DFT_CT_SOURCES})
set(EXAMPLE_NAME example_${domain}_${dft_ct_source})
add_executable(${EXAMPLE_NAME} ${dft_ct_source}.cpp)
target_include_directories(${EXAMPLE_NAME}
PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
PUBLIC ${PROJECT_SOURCE_DIR}/include
PUBLIC ${CMAKE_BINARY_DIR}/bin
)

add_dependencies(${EXAMPLE_NAME} onemkl_dft_${dfti_backend})
target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_dft_${dfti_backend} onemkl_warnings)
if(domain STREQUAL "dft" AND ENABLE_MKLCPU_BACKEND AND ENABLE_CUFFT_BACKEND)
add_dependencies(${EXAMPLE_NAME} onemkl_${domain}_mklcpu onemkl_${domain}_cufft)
list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_mklcpu onemkl_${domain}_cufft)
endif()

target_link_libraries(${EXAMPLE_NAME} PUBLIC
${ONEMKL_LIBRARIES_${domain}}
ONEMKL::SYCL::SYCL
onemkl_warnings
)

# Register example as ctest
add_test(NAME dft/EXAMPLE/CT/complex_fwd_buffer_${dfti_backend} COMMAND ${EXAMPLE_NAME})
endforeach(dfti_backend)
add_test(NAME dft/EXAMPLE/CT/${dft_ct_source} COMMAND ${EXAMPLE_NAME})

endforeach(dft_ct_source)

132 changes: 0 additions & 132 deletions examples/dft/compile_time_dispatching/complex_fwd_buffer_mklcpu.cpp

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2023 Intel Corporation
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,10 +27,26 @@
#include <CL/sycl.hpp>
#endif
#include "oneapi/mkl.hpp"
#include <complex>

void run_example(const sycl::device& gpu_device) {
void run_example(const sycl::device& cpu_device, const sycl::device& gpu_device) {
constexpr std::size_t N = 10;

// Catch asynchronous exceptions for cpu
auto cpu_error_handler = [&](sycl::exception_list exceptions) {
for (auto const& e : exceptions) {
try {
std::rethrow_exception(e);
}
catch (sycl::exception const& e) {
// Handle not dft related exceptions that happened during asynchronous call
std::cerr << "Caught asynchronous SYCL exception on CPU device during execution:"
<< std::endl;
std::cerr << "\t" << e.what() << std::endl;
}
}
std::exit(2);
};
// Catch asynchronous exceptions for gpu
auto gpu_error_handler = [&](sycl::exception_list exceptions) {
for (auto const& e : exceptions) {
Expand All @@ -39,17 +55,30 @@ void run_example(const sycl::device& gpu_device) {
}
catch (sycl::exception const& e) {
// Handle not dft related exceptions that happened during asynchronous call
std::cerr << "Caught asynchronous SYCL exception:" << std::endl;
std::cerr << "Caught asynchronous SYCL exception on GPU device during execution:"
<< std::endl;
std::cerr << "\t" << e.what() << std::endl;
}
}
std::exit(2);
};

// Preparation CPU device and GPU device
sycl::queue cpu_queue(cpu_device, cpu_error_handler);
sycl::queue gpu_queue(gpu_device, gpu_error_handler);

std::vector<std::complex<float>> input_data(N);
std::vector<std::complex<float>> output_data(N);
// allocate on CPU device and GPU device
auto cpu_input_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);
auto cpu_output_data = sycl::malloc_shared<std::complex<float>>(N, cpu_queue);

auto gpu_input_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);
auto gpu_output_data = sycl::malloc_shared<std::complex<float>>(N, gpu_queue);

// Initialize input data
for (std::size_t i = 0; i < N; ++i) {
cpu_input_data[i] = { static_cast<float>(i), static_cast<float>(-i) };
gpu_input_data[i] = { static_cast<float>(i), static_cast<float>(-i) };
}

// enabling
// 1. create descriptors
Expand All @@ -63,16 +92,27 @@ void run_example(const sycl::device& gpu_device) {
desc.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
static_cast<std::int64_t>(1));

// 3. commit_descriptor (compile_time MKLGPU)
desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklgpu>{ gpu_queue });
// 3a. commit_descriptor (compile_time MKLCPU)
desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu>{ cpu_queue });

// 4. compute_forward / compute_backward (MKLGPU)
{
sycl::buffer<std::complex<float>> input_buffer(input_data.data(), sycl::range<1>(N));
sycl::buffer<std::complex<float>> output_buffer(output_data.data(), sycl::range<1>(N));
oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
desc, input_buffer, output_buffer);
}
// 4a. compute_forward / compute_backward (MKLCPU)
oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
desc, cpu_input_data, cpu_output_data);

// 3b. commit_descriptor (compile_time cuFFT)
desc.commit(oneapi::mkl::backend_selector<oneapi::mkl::backend::cufft>{ gpu_queue });

// 4b. compute_forward / compute_backward (cuFFT)
oneapi::mkl::dft::compute_forward<decltype(desc), std::complex<float>, std::complex<float>>(
desc, gpu_input_data, gpu_output_data);

cpu_queue.wait_and_throw();
gpu_queue.wait_and_throw();

sycl::free(cpu_input_data, cpu_queue);
sycl::free(gpu_input_data, gpu_queue);
sycl::free(cpu_output_data, cpu_queue);
sycl::free(gpu_output_data, gpu_queue);
}

//
Expand All @@ -81,18 +121,16 @@ void run_example(const sycl::device& gpu_device) {
void print_example_banner() {
std::cout << "\n"
"########################################################################\n"
"# Complex out-of-place forward transform for Buffer API's example:\n"
"# Complex out-of-place forward transform for USM API's example:\n"
"#\n"
"# Using APIs:\n"
"# Compile-time dispatch API\n"
"# Buffer forward complex out-of-place\n"
"# USM forward complex out-of-place\n"
"#\n"
"# Using single precision (float) data type\n"
"#\n"
"# For Intel GPU with Intel MKLGPU backend.\n"
"# Running on both Intel CPU and NVIDIA GPU devices.\n"
"#\n"
"# The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify\n"
"# available devices\n"
"########################################################################\n"
<< std::endl;
}
Expand All @@ -104,15 +142,25 @@ int main(int /*argc*/, char** /*argv*/) {
print_example_banner();

try {
sycl::device cpu_device((sycl::cpu_selector_v));
sycl::device gpu_device((sycl::gpu_selector_v));
std::cout << "Running DFT Complex forward out-of-place buffer example" << std::endl;
std::cout << "Using compile-time dispatch API with MKLGPU." << std::endl;

unsigned int vendor_id = gpu_device.get_info<sycl::info::device::vendor_id>();
if (vendor_id != NVIDIA_ID) {
std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl;
return 1;
}

std::cout << "Running DFT Complex forward out-of-place usm example" << std::endl;
std::cout << "Using compile-time dispatch API with MKLCPU and cuFFT." << std::endl;
std::cout << "Running with single precision real data type on:" << std::endl;
std::cout << "\tCPU device: " << cpu_device.get_info<sycl::info::device::name>()
<< std::endl;
std::cout << "\tGPU device :" << gpu_device.get_info<sycl::info::device::name>()
<< std::endl;

run_example(gpu_device);
std::cout << "DFT Complex USM example ran OK on MKLGPU" << std::endl;
run_example(cpu_device, gpu_device);
std::cout << "DFT Complex USM example ran OK on MKLCPU and CUFFT" << std::endl;
}
catch (sycl::exception const& e) {
// Handle not dft related exceptions that happened during synchronous call
Expand Down
5 changes: 4 additions & 1 deletion examples/dft/run_time_dispatching/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,16 @@ set(DFT_RT_SOURCES "")
# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
# overwrite ONEAPI_DEVICE_SELECTOR in their environment to run on the desired backend
set(DEVICE_FILTERS "")
if(ENABLE_MKLGPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND)
if(ENABLE_MKLGPU_BACKEND OR ENABLE_MKLCPU_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_ROCFFT_BACKEND OR ENABLE_PORTFFT_BACKEND)
list(APPEND DFT_RT_SOURCES "real_fwd_usm")
endif()

if(ENABLE_MKLGPU_BACKEND)
list(APPEND DEVICE_FILTERS "level_zero:gpu")
endif()
if(ENABLE_MKLCPU_BACKEND)
list(APPEND DEVICE_FILTERS "opencl:cpu")
endif()
if(ENABLE_PORTFFT_BACKEND)
list(APPEND DEVICE_FILTERS "*:gpu")
endif()
Expand Down