
Commit

Merge branch 'romain/update_sparse_mkl' into romain/cusparse
Rbiessy committed Jul 19, 2024
2 parents 52becf2 + 2f59edc commit 2fbd6d2
Showing 101 changed files with 347 additions and 352 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -2,7 +2,7 @@

# oneAPI Math Kernel Library (oneMKL) Interfaces

-oneMKL Interfaces is an open-source implementation of the oneMKL Data Parallel C++ (DPC++) interface according to the [oneMKL specification](https://spec.oneapi.com/versions/latest/elements/oneMKL/source/index.html). It works with multiple devices (backends) using device-specific libraries underneath.
+oneMKL Interfaces is an open-source implementation of the oneMKL Data Parallel C++ (DPC++) interface according to the [oneMKL specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemkl/source/). It works with multiple devices (backends) using device-specific libraries underneath.

oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org).
<br/><br/>
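
As a rough illustration of the run-time dispatch this paragraph describes, here is a minimal sketch (not part of the README or this commit, assuming the standard `oneapi::mkl::blas` USM API): the backend used for a call is chosen from the device the queue targets.

```c++
#include <cstdint>
#include <sycl/sycl.hpp>
#include <oneapi/mkl.hpp>

int main() {
    // The backend (MKLCPU, MKLGPU, cuBLAS, rocBLAS, ...) is selected at run time
    // from the device bound to this queue.
    sycl::queue q{sycl::default_selector_v};

    constexpr std::int64_t n = 64;
    float* a = sycl::malloc_shared<float>(n * n, q);
    float* b = sycl::malloc_shared<float>(n * n, q);
    float* c = sycl::malloc_shared<float>(n * n, q);
    for (std::int64_t i = 0; i < n * n; ++i) {
        a[i] = 1.0f;
        b[i] = 1.0f;
        c[i] = 0.0f;
    }

    using oneapi::mkl::transpose;
    // C = 1.0 * A * B + 0.0 * C, dispatched to whichever backend supports q's device.
    oneapi::mkl::blas::column_major::gemm(q, transpose::nontrans, transpose::nontrans, n, n, n,
                                          1.0f, a, n, b, n, 0.0f, c, n)
        .wait();

    sycl::free(a, q);
    sycl::free(b, q);
    sycl::free(c, q);
    return 0;
}
```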
@@ -525,7 +525,8 @@ Product | Supported Version | License
- [About](https://oneapi-src.github.io/oneMKL/introduction.html)
- Get Started
- [Selecting a Compiler](https://oneapi-src.github.io/oneMKL/selecting_a_compiler.html)
- [Building the Project](https://oneapi-src.github.io/oneMKL/building_the_project.html)
- [Building the Project with DPC++](https://oneapi-src.github.io/oneMKL/building_the_project_with_dpcpp.html)
- [Building the Project with AdaptiveCpp](https://oneapi-src.github.io/oneMKL/building_the_project_with_adaptivecpp.html)
- Developer Reference
- [oneMKL Defined Datatypes](https://oneapi-src.github.io/oneMKL/onemkl-datatypes.html)
- [Dense Linear Algebra](https://oneapi-src.github.io/oneMKL/domains/dense_linear_algebra.html)
6 changes: 3 additions & 3 deletions docs/building_the_project_with_dpcpp.rst
@@ -344,7 +344,7 @@ The following table provides details of CMake options and their default values:
CMake invocation examples
##########################

-Build oneMKL with support for x86 CPU, Intel GPU, and Nvidia GPUs with tests
+Build oneMKL with support for Nvidia GPUs with tests
disabled using the Ninja build system:

.. code-block:: bash
@@ -367,7 +367,7 @@ and Intel GPU (``MKLGPU``) backends are enabled by default, but are disabled
here. The backends for Nvidia GPUs must all be explicitly enabled. The tests are
disabled, but the examples will still be built.

-Building oneMKL with support x86 CPU, Intel GPU, and AMD GPUs with tests
+Building oneMKL with support for AMD GPUs with tests
disabled:

.. code-block:: bash
@@ -405,7 +405,7 @@ GPU and Nvidia GPU with testing enabled:
Note that this is not a supported configuration, and requires Codeplay's oneAPI
for `AMD <https://developer.codeplay.com/products/oneapi/amd/home/>`_ and
`Nvidia <https://developer.codeplay.com/products/oneapi/nvidia/home/>`_ GPU
-plugins. Like the above example, the MKLCPU and MKLGPU backends are enabled by
+plugins. The MKLCPU and MKLGPU backends are enabled by
default, with backends for Nvidia GPU and AMD GPU explicitly enabled.
``-DTARGET_DOMAINS=dft`` causes only DFT backends to be built. If this was not
set, the backend libraries to enable the use of BLAS, LAPACK and RNG with MKLGPU
3 changes: 3 additions & 0 deletions docs/domains/sparse_linear_algebra.rst
@@ -29,6 +29,9 @@ Currently known limitations:
``oneapi::mkl::transpose::conjtrans`` and has the
``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw an
``oneapi::mkl::unimplemented`` exception.
- Using ``spsv`` on Intel GPU with a sparse matrix that is
``oneapi::mkl::transpose::conjtrans`` will throw an
``oneapi::mkl::unimplemented`` exception.
- Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent
synchronizations and copies to the host.
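
As a sketch of the host-pointer recommendation above (illustrative only: the matrix, vector and
descriptor handles are assumed to come from the corresponding ``init_*`` calls, and the ``spmv``
parameter order shown here mirrors the ``spmm`` entry points touched later in this commit, so
check it against the specification):

.. code-block:: cpp

   // alpha and beta live in host memory, so the backend can read them without a
   // device synchronization or an extra copy back to the host.
   float alpha = 1.0f;
   float beta = 0.0f;
   auto ev = oneapi::mkl::sparse::spmv(queue, oneapi::mkl::transpose::nontrans, &alpha,
                                       A_view, A_handle, x_handle, &beta, y_handle,
                                       oneapi::mkl::sparse::spmv_alg::default_alg,
                                       spmv_descr, dependencies);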

2 changes: 1 addition & 1 deletion docs/requirements.txt
@@ -2,7 +2,7 @@ accessible-pygments==0.0.5
alabaster==0.7.16
Babel==2.15.0
beautifulsoup4==4.12.3
-certifi==2024.2.2
+certifi==2024.7.4
charset-normalizer==3.3.2
docutils==0.21.2
idna==3.7
10 changes: 10 additions & 0 deletions examples/rng/device/include/rng_example_helper.hpp
@@ -29,12 +29,22 @@ struct has_member_code_meta<T, std::void_t<decltype(std::declval<T>().get_multi_

template <typename T, typename std::enable_if<has_member_code_meta<T>::value>::type* = nullptr>
auto get_multi_ptr(T acc) {
// Workaround for AdaptiveCpp, which does not yet support the get_multi_ptr function
#ifndef __HIPSYCL__
return acc.get_multi_ptr();
#else
return acc.get_pointer();
#endif
};

template <typename T, typename std::enable_if<!has_member_code_meta<T>::value>::type* = nullptr>
auto get_multi_ptr(T acc) {
// Workaround for AdaptiveCpp, which does not yet support the get_multi_ptr function
#ifndef __HIPSYCL__
return acc.template get_multi_ptr<sycl::access::decorated::yes>();
#else
return acc.get_pointer();
#endif
};

#endif // _RNG_EXAMPLE_HELPER_HPP__
@@ -148,8 +148,10 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) {
// Create and initialize dense vector handles
oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr;
oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr;
-oneapi::mkl::sparse::init_dense_vector(main_queue, &x_handle, static_cast<std::int64_t>(sizevec), x);
-oneapi::mkl::sparse::init_dense_vector(main_queue, &y_handle, static_cast<std::int64_t>(sizevec), y);
+oneapi::mkl::sparse::init_dense_vector(main_queue, &x_handle,
+static_cast<std::int64_t>(sizevec), x);
+oneapi::mkl::sparse::init_dense_vector(main_queue, &y_handle,
+static_cast<std::int64_t>(sizevec), y);

// Create operation descriptor
oneapi::mkl::sparse::spmv_descr_t descr = nullptr;
14 changes: 14 additions & 0 deletions src/blas/backends/rocblas/rocblas_level3.cpp
@@ -381,10 +381,17 @@ inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe
auto a_ = sc.get_mem<rocDataType *>(a_acc);
auto b_ = sc.get_mem<rocDataType *>(b_acc);
rocblas_status err;
#if ROCBLAS_VERSION_MAJOR >= 4
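// rocBLAS 4.x switched trmm to an out-of-place signature with a separate output matrix;
// passing b_ and ldb a second time as the output is assumed to preserve the original
// in-place behaviour.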
ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
get_rocblas_fill_mode(upper_lower),
get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb);
#else
ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
get_rocblas_fill_mode(upper_lower),
get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
#endif
});
});
}
@@ -805,10 +812,17 @@ inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upp
auto a_ = reinterpret_cast<const rocDataType *>(a);
auto b_ = reinterpret_cast<rocDataType *>(b);
rocblas_status err;
#if ROCBLAS_VERSION_MAJOR >= 4
ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
get_rocblas_fill_mode(upper_lower),
get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb);
#else
ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right),
get_rocblas_fill_mode(upper_lower),
get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag),
m, n, (rocDataType *)&alpha, a_, lda, b_, ldb);
#endif
});
});

9 changes: 7 additions & 2 deletions src/rng/backends/mklcpu/cpu_common.hpp
@@ -56,9 +56,14 @@ class kernel_name {};
template <typename Engine, typename Distr>
class kernel_name_usm {};

template <typename T, sycl::access_mode AccMode>
T *get_raw_ptr(sycl::accessor<T, 1, AccMode> acc) {
template <typename Acc>
typename Acc::value_type *get_raw_ptr(Acc acc) {
// Workaround for AdaptiveCpp, which does not yet support the get_multi_ptr function
#ifndef __HIPSYCL__
return acc.template get_multi_ptr<sycl::access::decorated::no>().get_raw();
#else
return acc.get_pointer();
#endif
}

} // namespace mklcpu
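
A short usage sketch for the generalized get_raw_ptr helper above (illustrative only, not code from this commit; zero_buffer_example is a hypothetical caller using a plain 1-D accessor):

    #include <sycl/sycl.hpp>

    void zero_buffer_example(sycl::queue& queue) {
        sycl::buffer<float, 1> buf{sycl::range<1>{256}};
        queue.submit([&](sycl::handler& cgh) {
            sycl::accessor<float, 1, sycl::access_mode::write> acc(buf, cgh);
            cgh.parallel_for(sycl::range<1>{256}, [=](sycl::id<1> i) {
                // get_raw_ptr yields a plain float* on both the DPC++ and the
                // AdaptiveCpp code paths, so the kernel body is identical for both.
                float* out = get_raw_ptr(acc);
                out[i] = 0.0f;
            });
        });
    }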
9 changes: 3 additions & 6 deletions src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -154,13 +154,10 @@ inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDes

/**
* cuSPARSE requires the pointer mode to be set for scalar parameters (typically alpha and beta).
* This seems needed only for compute functions which dereference the pointer.
*/
template <typename fpType>
void set_pointer_mode(cusparseHandle_t cu_handle, sycl::queue queue, fpType* ptr) {
cusparseSetPointerMode(cu_handle, detail::is_ptr_accessible_on_host(queue, ptr)
? CUSPARSE_POINTER_MODE_HOST
: CUSPARSE_POINTER_MODE_DEVICE);
inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
cusparseSetPointerMode(cu_handle, is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST
: CUSPARSE_POINTER_MODE_DEVICE);
}

} // namespace oneapi::mkl::sparse::cusparse
37 changes: 23 additions & 14 deletions src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp
@@ -77,8 +77,10 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mk
oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
oneapi::mkl::sparse::spmm_alg alg,
oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) {
detail::check_valid_spmm_common(__func__, queue, A_view, A_handle, B_handle, C_handle, alpha,
beta);
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
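// The host accessibility of alpha and beta is checked once here and reused both for the
// argument validation below and, inside the functor, for the cuSPARSE pointer mode.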
detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
is_alpha_host_accessible, is_beta_host_accessible);
fallback_alg_if_needed(alg, opA, opB);
auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler& sc) {
auto cu_handle = sc.get_handle(queue);
@@ -90,7 +92,7 @@
auto cu_op_b = get_cuda_operation(type, opB);
auto cu_type = get_cuda_value_type(type);
auto cu_alg = get_cuda_spmm_alg(alg);
set_pointer_mode(cu_handle, queue, alpha);
set_pointer_mode(cu_handle, is_alpha_host_accessible);
auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
cu_c, cu_type, cu_alg, &temp_buffer_size);
check_status(status, __func__);
@@ -105,7 +107,8 @@
oneapi::mkl::sparse::matrix_handle_t A_handle,
oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta,
oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
oneapi::mkl::sparse::spmm_alg alg, void* workspace_ptr) {
oneapi::mkl::sparse::spmm_alg alg, void* workspace_ptr,
bool is_alpha_host_accessible) {
auto cu_a = A_handle->backend_handle;
auto cu_b = B_handle->backend_handle;
auto cu_c = C_handle->backend_handle;
@@ -114,6 +117,7 @@
auto cu_op_b = get_cuda_operation(type, opB);
auto cu_type = get_cuda_value_type(type);
auto cu_alg = get_cuda_spmm_alg(alg);
set_pointer_mode(cu_handle, is_alpha_host_accessible);
auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta,
cu_c, cu_type, cu_alg, workspace_ptr);
check_status(status, "optimize_spmm");
@@ -126,8 +130,10 @@
oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
sycl::buffer<std::uint8_t, 1> workspace) {
detail::check_valid_spmm_common(__func__, queue, A_view, A_handle, B_handle, C_handle, alpha,
beta);
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
is_alpha_host_accessible, is_beta_host_accessible);
if (!A_handle->all_use_buffer()) {
detail::throw_incompatible_container(__func__);
}
@@ -143,7 +149,7 @@
auto cu_handle = sc.get_handle(queue);
auto workspace_ptr = sc.get_mem(workspace_acc);
spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg,
workspace_ptr);
workspace_ptr, is_alpha_host_accessible);
};

sycl::accessor<std::uint8_t, 1> workspace_placeholder_acc(workspace);
Expand All @@ -161,8 +167,10 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA,
oneapi::mkl::sparse::spmm_alg alg,
oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* workspace,
const std::vector<sycl::event>& dependencies) {
detail::check_valid_spmm_common(__func__, queue, A_view, A_handle, B_handle, C_handle, alpha,
beta);
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
is_alpha_host_accessible, is_beta_host_accessible);
if (A_handle->all_use_buffer()) {
detail::throw_incompatible_container(__func__);
}
@@ -174,9 +182,8 @@
fallback_alg_if_needed(alg, opA, opB);
auto functor = [=](CusparseScopedContextHandler& sc) {
auto cu_handle = sc.get_handle(queue);
set_pointer_mode(cu_handle, queue, alpha);
spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg,
workspace);
workspace, is_alpha_host_accessible);
};

return dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, C_handle);
@@ -189,8 +196,10 @@
oneapi::mkl::sparse::dense_matrix_handle_t C_handle,
oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr,
const std::vector<sycl::event>& dependencies) {
detail::check_valid_spmm_common(__func__, queue, A_view, A_handle, B_handle, C_handle, alpha,
beta);
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha);
bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta);
detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle,
is_alpha_host_accessible, is_beta_host_accessible);
if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) {
detail::throw_incompatible_container(__func__);
}
@@ -205,7 +214,7 @@
auto cu_op_b = get_cuda_operation(type, opB);
auto cu_type = get_cuda_value_type(type);
auto cu_alg = get_cuda_spmm_alg(alg);
set_pointer_mode(cu_handle, queue, alpha);
set_pointer_mode(cu_handle, is_alpha_host_accessible);
auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c,
cu_type, cu_alg, workspace_ptr);
check_status(status, __func__);
